diff --git a/.ci/monolithic-windows.sh b/.ci/monolithic-windows.sh index 219979dd3e36e..5fb8f69528e89 100755 --- a/.ci/monolithic-windows.sh +++ b/.ci/monolithic-windows.sh @@ -23,8 +23,8 @@ runtimes_targets="${4}" start-group "CMake" pip install -q -r "${MONOREPO_ROOT}"/.ci/all_requirements.txt -export CC=cl -export CXX=cl +export CC=C:/clang/clang-msvc/bin/clang-cl.exe +export CXX=C:/clang/clang-msvc/bin/clang-cl.exe export LD=link # The CMAKE_*_LINKER_FLAGS to disable the manifest come from research @@ -49,6 +49,7 @@ cmake -S "${MONOREPO_ROOT}"/llvm -B "${BUILD_DIR}" \ -D CMAKE_EXE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_MODULE_LINKER_FLAGS="/MANIFEST:NO" \ -D CMAKE_SHARED_LINKER_FLAGS="/MANIFEST:NO" \ + -D CMAKE_CXX_FLAGS="-Wno-c++98-compat -Wno-c++14-compat -Wno-unsafe-buffer-usage -Wno-old-style-cast" \ -D LLVM_ENABLE_RUNTIMES="${runtimes}" start-group "ninja" diff --git a/.ci/premerge_advisor_upload.py b/.ci/premerge_advisor_upload.py index 1fc2423dd24fc..cb379b0e77cd6 100644 --- a/.ci/premerge_advisor_upload.py +++ b/.ci/premerge_advisor_upload.py @@ -12,9 +12,12 @@ import generate_test_report_lib -PREMERGE_ADVISOR_URL = ( - "http://premerge-advisor.premerge-advisor.svc.cluster.local:5000/upload" -) +# These are IP addresses of the two premerge advisor instances. They should +# eventually be updated to domain names. 
+PREMERGE_ADVISOR_URLS = [ + "http://34.82.126.63:5000/upload", + "http://136.114.125.23:5000/upload", +] def main(commit_sha, workflow_run_number, build_log_files): @@ -41,7 +44,8 @@ def main(commit_sha, workflow_run_number, build_log_files): ninja_failures = generate_test_report_lib.find_failure_in_ninja_logs(ninja_logs) for name, failure_message in ninja_failures: failure_info["failures"].append({"name": name, "message": failure_message}) - requests.post(PREMERGE_ADVISOR_URL, json=failure_info) + for premerge_advisor_url in PREMERGE_ADVISOR_URLS: + requests.post(premerge_advisor_url, json=failure_info) if __name__ == "__main__": diff --git a/.ci/utils.sh b/.ci/utils.sh index dc8ce9b9a4214..540acfa8d5cc5 100644 --- a/.ci/utils.sh +++ b/.ci/utils.sh @@ -40,13 +40,17 @@ function at-exit { fi if [[ "$retcode" != "0" ]]; then - python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \ - $(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \ - "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log if [[ "$GITHUB_ACTIONS" != "" ]]; then python "${MONOREPO_ROOT}"/.ci/premerge_advisor_explain.py \ $(git rev-parse HEAD~1) "${BUILD_DIR}"/test-results.*.xml \ "${MONOREPO_ROOT}"/ninja*.log + python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \ + $(git rev-parse HEAD~1) $GITHUB_RUN_NUMBER \ + "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log + else + python "${MONOREPO_ROOT}"/.ci/premerge_advisor_upload.py \ + $(git rev-parse HEAD) $BUILDBOT_BUILDNUMBER \ + "${BUILD_DIR}"/test-results.*.xml "${MONOREPO_ROOT}"/ninja*.log fi fi } diff --git a/.github/workflows/containers/github-action-ci-tooling/Dockerfile b/.github/workflows/containers/github-action-ci-tooling/Dockerfile index 9d2aaf6bbd48a..8aaa2e88f2bab 100644 --- a/.github/workflows/containers/github-action-ci-tooling/Dockerfile +++ b/.github/workflows/containers/github-action-ci-tooling/Dockerfile @@ -37,6 +37,14 @@ RUN apt-get update && \ apt-get clean && \ rm -rf /var/lib/apt/lists/* +# Create a new 
user with id 1001 as that is the user id that +# Github Actions uses to perform the checkout action. +RUN useradd gha -u 1001 -m -s /bin/bash +RUN adduser gha sudo +RUN echo '%sudo ALL=(ALL) NOPASSWD:ALL' >> /etc/sudoers +# Don't set USER gha right away because we still need to install packages +# as root in 'ci-container-code-format' and 'ci-container-code-lint' containers + FROM base AS ci-container-code-format ARG LLVM_VERSION @@ -51,6 +59,8 @@ ENV PATH=${LLVM_SYSROOT}/bin:${PATH} COPY llvm/utils/git/requirements_formatting.txt requirements_formatting.txt RUN pip install -r requirements_formatting.txt --break-system-packages && \ rm requirements_formatting.txt +USER gha +WORKDIR /home/gha FROM base AS ci-container-code-lint @@ -80,3 +90,5 @@ RUN apt-get update && \ COPY llvm/utils/git/requirements_linting.txt requirements_linting.txt RUN pip install -r requirements_linting.txt --break-system-packages && \ rm requirements_linting.txt +USER gha +WORKDIR /home/gha diff --git a/.github/workflows/containers/github-action-ci-windows/Dockerfile b/.github/workflows/containers/github-action-ci-windows/Dockerfile index 9ddf5017bc020..f1e4f1538540b 100644 --- a/.github/workflows/containers/github-action-ci-windows/Dockerfile +++ b/.github/workflows/containers/github-action-ci-windows/Dockerfile @@ -98,3 +98,45 @@ RUN powershell -Command \ Add-Type -AssemblyName System.IO.Compression.FileSystem ; \ [System.IO.Compression.ZipFile]::ExtractToDirectory('actions-runner-win.zip', $PWD) ;\ rm actions-runner-win.zip + +# Set the LLVM_VERSION environment variable +ENV LLVM_VERSION=21.1.2 + +# Download and extract Clang compiler. 
+# Create directories, download, extract, and clean up all in one layer +RUN powershell -Command \ + # --- Setup directories --- \ + Write-Host "Creating directories..."; \ + New-Item -Path "C:\temp-download" -ItemType "Directory" -Force ; \ + New-Item -Path "C:\xz-utils" -ItemType "Directory" -Force ; \ + New-Item -Path "C:\clang" -ItemType "Directory" -Force ; \ + # --- 1. Download and extract xz --- \ + Set-Location C:\temp-download ; \ + Invoke-WebRequest -Uri "http://github.com/tukaani-project/xz/releases/download/v5.8.1/xz-5.8.1-windows.zip" -OutFile "xz.zip"; \ + (Get-FileHash -Path "C:\temp-download\xz.zip" -Algorithm MD5).Hash -eq 'c3c69fdce3e825cc0b76123b36b0bcc2' ; \ + Add-Type -AssemblyName "System.IO.Compression.FileSystem"; \ + [System.IO.Compression.ZipFile]::ExtractToDirectory('C:\temp-download\xz.zip', 'C:\xz-utils'); \ + # --- 2. Download and decompress Clang --- \ + Invoke-WebRequest -Uri "http://github.com/llvm/llvm-project/releases/download/llvmorg-21.1.2/clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar.xz" -OutFile "clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar.xz" ; \ + (Get-FileHash -Path "C:\temp-download\clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar.xz" -Algorithm MD5).Hash -eq '0ae1d3effd9ab9d323f7fa595777f0a2' ; \ + C:\xz-utils\bin_x86-64\xz.exe -d -qq clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar.xz ; \ + # --- 3. Extract clang --- \ + C:\Windows\System32\tar.exe -xf clang+llvm-21.1.2-x86_64-pc-windows-msvc.tar -C C:\clang ; \ + # --- 4. Clean up --- \ + Set-Location C:\ ; \ + Remove-Item C:\temp-download -Recurse -Force; \ + Remove-Item C:\xz-utils -Recurse -Force; \ + # -- 5. 
Shorten path to clang files & remove unnecessary files -- \ + Set-Location C:\clang ; \ + Rename-Item -Path "C:\clang\clang+llvm-21.1.2-x86_64-pc-windows-msvc" -NewName "C:\clang\clang-msvc" ; \ + Set-Location C:\clang\clang-msvc ; \ + Remove-Item -Path C:\clang\clang-msvc\libexec -Recurse -Force ; \ + Remove-Item -Path C:\clang\clang-msvc\share -Recurse -Force ; \ + Rename-Item -Path "C:\clang\clang-msvc\bin" -NewName "C:\clang\clang-msvc\bin-full" ; \ + New-Item -Path "C:\clang\clang-msvc\bin" -ItemType Directory -Force ; \ + Set-Location C:\clang\clang-msvc\bin ; \ + Copy-Item -Path C:\clang\clang-msvc\bin-full\*.dll -Destination C:\clang\clang-msvc\bin\. ; \ + Copy-Item -Path C:\clang\clang-msvc\bin-full\clang-cl.exe -Destination C:\clang\clang-msvc\bin\. ; \ + Copy-Item -Path C:\clang\clang-msvc\bin-full\lld-link.exe -Destination C:\clang\clang-msvc\bin\. ; \ + Set-Location C:\clang\clang-msvc ; \ + Remove-Item -Path C:\clang\clang-msvc\bin-full -Recurse -Force ; diff --git a/.github/workflows/libcxx-run-benchmarks.yml b/.github/workflows/libcxx-run-benchmarks.yml index 0379a0a1f857d..9e8f55859fc7a 100644 --- a/.github/workflows/libcxx-run-benchmarks.yml +++ b/.github/workflows/libcxx-run-benchmarks.yml @@ -64,17 +64,21 @@ jobs: path: repo # Avoid nuking the workspace, where we have the Python virtualenv - name: Run baseline + env: + BENCHMARKS: ${{ steps.vars.outputs.benchmarks }} run: | source .venv/bin/activate && cd repo python -m pip install -r libcxx/utils/requirements.txt baseline_commit=$(git merge-base ${{ steps.vars.outputs.pr_base }} ${{ steps.vars.outputs.pr_head }}) - ./libcxx/utils/test-at-commit --commit ${baseline_commit} -B build/baseline -- -sv -j1 --param optimization=speed ${{ steps.vars.outputs.benchmarks }} + ./libcxx/utils/test-at-commit --commit ${baseline_commit} -B build/baseline -- -sv -j1 --param optimization=speed "$BENCHMARKS" ./libcxx/utils/consolidate-benchmarks build/baseline | tee baseline.lnt - name: Run candidate + env: + 
BENCHMARKS: ${{ steps.vars.outputs.benchmarks }} run: | source .venv/bin/activate && cd repo - ./libcxx/utils/test-at-commit --commit ${{ steps.vars.outputs.pr_head }} -B build/candidate -- -sv -j1 --param optimization=speed ${{ steps.vars.outputs.benchmarks }} + ./libcxx/utils/test-at-commit --commit ${{ steps.vars.outputs.pr_head }} -B build/candidate -- -sv -j1 --param optimization=speed "$BENCHMARKS" ./libcxx/utils/consolidate-benchmarks build/candidate | tee candidate.lnt - name: Compare baseline and candidate runs diff --git a/.github/workflows/pr-code-format.yml b/.github/workflows/pr-code-format.yml index 2b85d8b59869c..ac0689b4d3243 100644 --- a/.github/workflows/pr-code-format.yml +++ b/.github/workflows/pr-code-format.yml @@ -25,14 +25,6 @@ jobs: with: fetch-depth: 2 - # We need to set the repo checkout as safe, otherwise tj-actions/changed-files - # will fail due to the changed ownership inside the container. - # TODO(boomanaiden154): We should probably fix this by having the default user - # in the container have the same ID as the GHA user on the host. 
- - name: Set Safe Directory - run: | - chown -R root $(pwd) - - name: Get changed files id: changed-files uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 diff --git a/.github/workflows/pr-code-lint.yml b/.github/workflows/pr-code-lint.yml index e67b518149c2c..8ba9378703739 100644 --- a/.github/workflows/pr-code-lint.yml +++ b/.github/workflows/pr-code-lint.yml @@ -31,11 +31,6 @@ jobs: with: fetch-depth: 2 - # FIXME: same as in ".github/workflows/pr-code-format.yml" - - name: Set Safe Directory - run: | - chown -R root $(pwd) - - name: Get changed files id: changed-files uses: tj-actions/changed-files@ed68ef82c095e0d48ec87eccea555d944a631a4c # v46.0.5 diff --git a/.github/workflows/premerge.yaml b/.github/workflows/premerge.yaml index 951fc16bed215..6303a119750b5 100644 --- a/.github/workflows/premerge.yaml +++ b/.github/workflows/premerge.yaml @@ -193,7 +193,7 @@ jobs: uses: llvm/actions/install-ninja@main - name: Build and Test run: | - source <(git diff --name-only HEAD~2..HEAD | python3 .ci/compute_projects.py) + source <(git diff --name-only HEAD~1...HEAD | python3 .ci/compute_projects.py) if [[ "${projects_to_build}" == "" ]]; then echo "No projects to build" diff --git a/.github/workflows/release-binaries.yml b/.github/workflows/release-binaries.yml index 83969b5490685..512eacc689c02 100644 --- a/.github/workflows/release-binaries.yml +++ b/.github/workflows/release-binaries.yml @@ -58,7 +58,6 @@ jobs: ref: ${{ steps.vars.outputs.ref }} upload: ${{ steps.vars.outputs.upload }} target-cmake-flags: ${{ steps.vars.outputs.target-cmake-flags }} - ccache: ${{ steps.vars.outputs.ccache }} build-flang: ${{ steps.vars.outputs.build-flang }} release-binary-basename: ${{ steps.vars.outputs.release-binary-basename }} release-binary-filename: ${{ steps.vars.outputs.release-binary-filename }} @@ -123,13 +122,6 @@ jobs: echo "release-binary-filename=$release_binary_basename.tar.xz" >> $GITHUB_OUTPUT target="$RUNNER_OS-$RUNNER_ARCH" - # 
The hendrikmuhs/ccache-action action does not support installing sccache - # on arm64 Linux. - if [ "$target" = "Linux-ARM64" ]; then - echo ccache=ccache >> $GITHUB_OUTPUT - else - echo ccache=sccache >> $GITHUB_OUTPUT - fi # The macOS builds try to cross compile some libraries so we need to # add extra CMake args to disable them. @@ -149,15 +141,12 @@ jobs: target_cmake_flags="$target_cmake_flags -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_ARCHS=$arches -DBOOTSTRAP_BOOTSTRAP_DARWIN_osx_BUILTIN_ARCHS=$arches" fi - build_flang="true" - if [ "$RUNNER_OS" = "Windows" ]; then # The build times out on Windows, so we need to disable LTO. target_cmake_flags="$target_cmake_flags -DLLVM_RELEASE_ENABLE_LTO=OFF" fi echo "target-cmake-flags=$target_cmake_flags" >> $GITHUB_OUTPUT - echo "build-flang=$build_flang" >> $GITHUB_OUTPUT case "${{ inputs.runs-on }}" in ubuntu-22.04*) build_runs_on="depot-${{ inputs.runs-on }}-16" @@ -222,15 +211,12 @@ jobs: - name: Configure id: build shell: bash - env: - CCACHE_BIN: ${{ needs.prepare.outputs.ccache }} run: | # There were some issues on the ARM64 MacOS runners with trying to build x86 object, # so we need to set some extra cmake flags to disable this. cmake -G Ninja -S llvm -B ${{ steps.setup-stage.outputs.build-prefix }}/build \ ${{ needs.prepare.outputs.target-cmake-flags }} \ -C clang/cmake/caches/Release.cmake \ - -DBOOTSTRAP_LLVM_PARALLEL_LINK_JOBS=1 \ -DBOOTSTRAP_BOOTSTRAP_CPACK_PACKAGE_FILE_NAME="${{ needs.prepare.outputs.release-binary-basename }}" - name: Build diff --git a/.github/workflows/release-documentation.yml b/.github/workflows/release-documentation.yml index d3d375d3a6df9..4cf973d000a4b 100644 --- a/.github/workflows/release-documentation.yml +++ b/.github/workflows/release-documentation.yml @@ -25,6 +25,10 @@ on: description: 'Upload documentation' required: false type: boolean + secrets: + WWW_RELEASES_TOKEN: + description: "Secret used to create a PR with the documentation changes." 
+ required: false jobs: release-documentation: diff --git a/.github/workflows/release-tasks.yml b/.github/workflows/release-tasks.yml index a184996968cdd..d4c2a55fcc9d7 100644 --- a/.github/workflows/release-tasks.yml +++ b/.github/workflows/release-tasks.yml @@ -54,6 +54,9 @@ jobs: with: release-version: ${{ needs.validate-tag.outputs.release-version }} upload: true + # Called workflows don't have access to secrets by default, so we need to explicitly pass secrets that we use. + secrets: + WWW_RELEASES_TOKEN: ${{ secrets.WWW_RELEASES_TOKEN }} release-doxygen: name: Build and Upload Release Doxygen diff --git a/amd/comgr/include/amd_comgr.h.in b/amd/comgr/include/amd_comgr.h.in index 5a0f0df2506b0..6a91b38e39e98 100644 --- a/amd/comgr/include/amd_comgr.h.in +++ b/amd/comgr/include/amd_comgr.h.in @@ -1634,11 +1634,17 @@ typedef enum amd_comgr_action_kind_s { */ AMD_COMGR_ACTION_COMPILE_SOURCE_TO_BC = 0x2, /** - * Link a collection of bitcodes in @p into a single composite bitcode @p. + * Link a collection of bitcodes, bundled bitcodes, and bundled bitcode + * archives in @p into a single composite (unbundled) bitcode @p. * Any device library bc data object must be explicitly added to @p input if * needed. * - * Return @p AMD_COMGR_STATUS_ERROR if the link fails. + * Return @p AMD_COMGR_STATUS_ERROR if the link or unbundling fails. + * + * Return @p AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT + * if IsaName is not set in @p info and does not match the isa name + * of all bc data objects in @p input, or if the Name field is not set for + * any DataObject in the input set. 
*/ AMD_COMGR_ACTION_LINK_BC_TO_BC = 0x3, /** diff --git a/amd/comgr/src/comgr-clang-command.cpp b/amd/comgr/src/comgr-clang-command.cpp index 05e340db9f688..7111dcecace9f 100644 --- a/amd/comgr/src/comgr-clang-command.cpp +++ b/amd/comgr/src/comgr-clang-command.cpp @@ -96,7 +96,8 @@ SmallVector getInputFiles(driver::Command &Command) { } // namespace ClangCommand::ClangCommand(driver::Command &Command, - DiagnosticOptions &DiagOpts, vfs::FileSystem &VFS, + DiagnosticOptions &DiagOpts, + IntrusiveRefCntPtr VFS, ExecuteFnTy &&ExecuteImpl) : Command(Command), DiagOpts(DiagOpts), VFS(VFS), ExecuteImpl(std::move(ExecuteImpl)) {} diff --git a/amd/comgr/src/comgr-clang-command.h b/amd/comgr/src/comgr-clang-command.h index b7974dd96fb34..31aeee7d9e99c 100644 --- a/amd/comgr/src/comgr-clang-command.h +++ b/amd/comgr/src/comgr-clang-command.h @@ -25,12 +25,12 @@ class ClangCommand final : public CachedCommandAdaptor { public: using ExecuteFnTy = std::function; + llvm::IntrusiveRefCntPtr)>; private: clang::driver::Command &Command; clang::DiagnosticOptions &DiagOpts; - llvm::vfs::FileSystem &VFS; + llvm::IntrusiveRefCntPtr VFS; ExecuteFnTy ExecuteImpl; // To avoid copies, store the output of execute, such that readExecuteOutput @@ -39,7 +39,8 @@ class ClangCommand final : public CachedCommandAdaptor { public: ClangCommand(clang::driver::Command &Command, - clang::DiagnosticOptions &DiagOpts, llvm::vfs::FileSystem &VFS, + clang::DiagnosticOptions &DiagOpts, + llvm::IntrusiveRefCntPtr VFS, ExecuteFnTy &&ExecuteImpl); bool canCache() const override; diff --git a/amd/comgr/src/comgr-compiler.cpp b/amd/comgr/src/comgr-compiler.cpp index bf8d33176dd85..2f9126d2f030e 100644 --- a/amd/comgr/src/comgr-compiler.cpp +++ b/amd/comgr/src/comgr-compiler.cpp @@ -639,9 +639,10 @@ void logArgv(raw_ostream &OS, StringRef ProgramName, OS.flush(); } -amd_comgr_status_t executeCommand(const Command &Job, raw_ostream &LogS, - DiagnosticOptions &DiagOpts, - llvm::vfs::FileSystem &FS) { 
+amd_comgr_status_t +executeCommand(const Command &Job, raw_ostream &LogS, + DiagnosticOptions &DiagOpts, + IntrusiveRefCntPtr FS) { TextDiagnosticPrinter DiagClient(LogS, DiagOpts); IntrusiveRefCntPtr DiagID(new DiagnosticIDs); DiagnosticsEngine Diags(DiagID, DiagOpts, &DiagClient, false); @@ -661,7 +662,7 @@ amd_comgr_status_t executeCommand(const Command &Job, raw_ostream &LogS, std::unique_ptr Clang(new CompilerInstance()); Clang->setVerboseOutputStream(LogS); - Clang->setFileManager(new FileManager(Clang->getFileSystemOpts(), &FS)); + Clang->setVirtualFileSystem(FS); if (!Argv.back()) { Argv.pop_back(); } @@ -777,7 +778,7 @@ AMDGPUCompiler::executeInProcessDriver(ArrayRef Args) { auto Cache = CommandCache::get(LogS); for (auto &Job : C->getJobs()) { - ClangCommand C(Job, *DiagOpts, *OverlayFS, executeCommand); + ClangCommand C(Job, *DiagOpts, OverlayFS, executeCommand); if (Cache) { if (auto Status = Cache->execute(C, LogS)) { return Status; @@ -1373,6 +1374,13 @@ amd_comgr_status_t AMDGPUCompiler::unbundle() { size_t Index = OutputPrefix.find_last_of("."); OutputPrefix = OutputPrefix.substr(0, Index); + // TODO: Log Command (see linkBitcodeToBitcode() unbundling) + if (env::shouldEmitVerboseLogs()) { + LogS << " Extracting Bundle:\n" + << " Input Filename: " << BundlerConfig.InputFileNames[0] << "\n" + << " Unbundled Files Extension: ." 
<< FileExtension << "\n"; + } + for (StringRef Entry : ActionInfo->BundleEntryIDs) { // Add an output file for each target SmallString<128> OutputFilePath = OutputDir; @@ -1462,31 +1470,266 @@ amd_comgr_status_t AMDGPUCompiler::linkBitcodeToBitcode() { } } - if (Input->DataKind != AMD_COMGR_DATA_KIND_BC) { - continue; - } + if (Input->DataKind == AMD_COMGR_DATA_KIND_BC) { + if (env::shouldEmitVerboseLogs()) { + LogS << "\t Linking Bitcode: " << InputDir << path::get_separator() << Input->Name + << "\n"; + } - if (env::shouldEmitVerboseLogs()) { - LogS << "\t Linking Bitcode: " << InputDir << path::get_separator() << Input->Name - << "\n"; - } + // The data in Input outlives Mod, and the linker destructs Mod after + // linking it into composite (i.e. ownership is not transferred to the + // composite) so MemoryBuffer::getMemBuffer is sufficient. + auto Mod = + getLazyIRModule(MemoryBuffer::getMemBuffer( + StringRef(Input->Data, Input->Size), "", false), + SMDiag, Context, true); - // The data in Input outlives Mod, and the linker destructs Mod after - // linking it into composite (i.e. ownership is not transferred to the - // composite) so MemoryBuffer::getMemBuffer is sufficient. 
- auto Mod = - getLazyIRModule(MemoryBuffer::getMemBuffer( - StringRef(Input->Data, Input->Size), "", false), - SMDiag, Context, true); + if (!Mod) { + SMDiag.print(Input->Name, LogS, /* ShowColors */ false); + return AMD_COMGR_STATUS_ERROR; + } + if (verifyModule(*Mod, &LogS)) + return AMD_COMGR_STATUS_ERROR; + if (L.linkInModule(std::move(Mod), ApplicableFlags)) + return AMD_COMGR_STATUS_ERROR; + } else if (Input->DataKind == AMD_COMGR_DATA_KIND_BC_BUNDLE) { + if (env::shouldEmitVerboseLogs()) { + LogS << " Linking Bundle: " << InputDir << path::get_separator() << Input->Name + << "\n"; + } - if (!Mod) { - SMDiag.print(Input->Name, LogS, /* ShowColors */ false); - return AMD_COMGR_STATUS_ERROR; + // Determine desired bundle entry ID + // TODO: Move away from using ActionInfo->IsaName + // Use ActionInfo->BundleEntryIDs instead + if (!ActionInfo->IsaName) + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + + std::string IsaName = ActionInfo->IsaName; + size_t Index = IsaName.find("gfx"); + std::string BundleEntryId = + "hip-amdgcn-amd-amdhsa--gfx" + IsaName.substr(Index + 3); + + // Write data to file system so that Offload Bundler can process, assuming + // we didn't already write due to shouldSaveTemps() conditional above + // TODO: Switch write to VFS + if (!env::shouldSaveTemps()) { + if (auto Status = outputToFile(Input, getFilePath(Input, InputDir))) { + return Status; + } + } + + // Configure Offload Bundler + OffloadBundlerConfig BundlerConfig; + BundlerConfig.AllowMissingBundles = true; + BundlerConfig.FilesType = "bc"; + + BundlerConfig.TargetNames.push_back(BundleEntryId); + std::string InputFilePath = getFilePath(Input, InputDir).str().str(); + BundlerConfig.InputFileNames.push_back(InputFilePath); + + // Generate prefix for output files + std::string OutputPrefix = std::string(Input->Name); + Index = OutputPrefix.find_last_of("."); + OutputPrefix = OutputPrefix.substr(0, Index); + std::string OutputFileName = OutputPrefix + '-' + BundleEntryId + 
".bc"; + + // ISA name may contain ':', which is an invalid character in file names + // on Windows. Replace with '_' + std::replace(OutputFileName.begin(), OutputFileName.end(), ':', '_'); + + std::string OutputFilePath = OutputDir.str().str() + path::get_separator().str() + OutputFileName; + BundlerConfig.OutputFileNames.push_back(OutputFilePath); + + OffloadBundler Bundler(BundlerConfig); + + // Execute unbundling + if (env::shouldEmitVerboseLogs()) { + LogS << "Extracting Bitcode Bundle:\n" + << "\t Bundle Entry ID: " << BundlerConfig.TargetNames[0] << "\n" + << "\t Input Filename: " << BundlerConfig.InputFileNames[0] + << "\n" + << "\t Output Filename: " << BundlerConfig.OutputFileNames[0] + << "\n"; + LogS << "\t Command: clang-offload-bundler -unbundle -type=bc" + " -targets=" + << BundleEntryId << " -input=" << InputFilePath + << " -output=" << OutputFilePath << "\n"; + LogS.flush(); + } + + llvm::Error Err = Bundler.UnbundleFiles(); + llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), + "UnbundleFiles error: "); + + // Read unbundled bitcode from file system in order to pass to linker + amd_comgr_data_t ResultT; + if (auto Status = amd_comgr_create_data(AMD_COMGR_DATA_KIND_BC, &ResultT)) + return Status; + + // ResultT can be released after addition to the data_set + ScopedDataObjectReleaser SDOR(ResultT); + + DataObject *Result = DataObject::convert(ResultT); + if (auto Status = inputFromFile(Result, StringRef(OutputFilePath))) + return Status; + + Result->Name = strdup(OutputFileName.c_str()); + + auto Mod = + getLazyIRModule(MemoryBuffer::getMemBuffer( + StringRef(Result->Data, Result->Size), "", false), + SMDiag, Context, true); + + if (!Mod) { + SMDiag.print(Result->Name, LogS, /* ShowColors */ false); + return AMD_COMGR_STATUS_ERROR; + } + if (verifyModule(*Mod, &LogS)) + return AMD_COMGR_STATUS_ERROR; + if (L.linkInModule(std::move(Mod), ApplicableFlags)) + return AMD_COMGR_STATUS_ERROR; } - if (verifyModule(*Mod, &LogS)) - return 
AMD_COMGR_STATUS_ERROR; - if (L.linkInModule(std::move(Mod), ApplicableFlags)) - return AMD_COMGR_STATUS_ERROR; + // Unbundle bitcode archive + else if (Input->DataKind == AMD_COMGR_DATA_KIND_AR_BUNDLE) { + if (env::shouldEmitVerboseLogs()) { + LogS << "\t Linking Archive: " << InputDir << path::get_separator() << Input->Name + << "\n"; + } + + // Determine desired bundle entry ID + // TODO: Move away from using ActionInfo->IsaName + // Use ActionInfo->BundleEntryIDs instead + if (!ActionInfo->IsaName) + return AMD_COMGR_STATUS_ERROR_INVALID_ARGUMENT; + + std::string IsaName = ActionInfo->IsaName; + size_t Index = IsaName.find("gfx"); + std::string BundleEntryId = + "hip-amdgcn-amd-amdhsa--gfx" + IsaName.substr(Index + 3); + + // Write data to file system so that Offload Bundler can process, assuming + // we didn't already write due to shouldSaveTemps() conditional above + // TODO: Switch write to VFS + if (!env::shouldSaveTemps()) { + if (auto Status = outputToFile(Input, getFilePath(Input, InputDir))) { + return Status; + } + } + + // Configure Offload Bundler + OffloadBundlerConfig BundlerConfig; + BundlerConfig.AllowMissingBundles = true; + BundlerConfig.FilesType = "a"; + BundlerConfig.HipOpenmpCompatible = 1; + BundlerConfig.AllowNoHost = 1; + + BundlerConfig.TargetNames.push_back(BundleEntryId); + std::string InputFilePath = getFilePath(Input, InputDir).str().str(); + BundlerConfig.InputFileNames.push_back(InputFilePath); + + // Generate prefix for output files + std::string OutputPrefix = std::string(Input->Name); + Index = OutputPrefix.find_last_of("."); + OutputPrefix = OutputPrefix.substr(0, Index); + + std::string OutputFileName = OutputPrefix + '-' + BundleEntryId + ".a"; + + // ISA name may contain ':', which is an invalid character in file names + // on Windows. 
Replace with '_' + std::replace(OutputFileName.begin(), OutputFileName.end(), ':', '_'); + + std::string OutputFilePath = OutputDir.str().str() + path::get_separator().str() + OutputFileName; + BundlerConfig.OutputFileNames.push_back(OutputFilePath); + + OffloadBundler Bundler(BundlerConfig); + + // Execute unbundling + if (env::shouldEmitVerboseLogs()) { + LogS << " Extracting Bitcode Archive:\n" + << "\t Bundle Entry ID: " << BundlerConfig.TargetNames[0] << "\n" + << "\t Input Filename: " << BundlerConfig.InputFileNames[0] + << "\n" + << "\t Output Filename: " << BundlerConfig.OutputFileNames[0] + << "\n"; + LogS << "\t Command: clang-offload-bundler -unbundle -type=a " + " -targets=" + << BundleEntryId << " -input=" << InputFilePath + << " -output=" << OutputFilePath << "\n"; + LogS.flush(); + } + llvm::Error Err = Bundler.UnbundleArchive(); + llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), + "UnbundleArchive error: "); + + // Read archive back into Comgr + amd_comgr_data_t ResultT; + if (auto Status = amd_comgr_create_data(AMD_COMGR_DATA_KIND_AR, &ResultT)) + return Status; + + // ResultT can be released after addition to the data_set + ScopedDataObjectReleaser SDOR(ResultT); + + DataObject *Result = DataObject::convert(ResultT); + if (auto Status = inputFromFile(Result, StringRef(OutputFilePath))) + return Status; + + // Get memory buffer for each bitcode in archive file + // Modeled after static loadArFile in llvm-link.cpp + std::string ArchiveName = "comgr.ar"; + llvm::StringRef ArchiveBuf = StringRef(Result->Data, Result->Size); + auto ArchiveOrError = + object::Archive::create(MemoryBufferRef(ArchiveBuf, ArchiveName)); + + if (!ArchiveOrError) { + llvm::logAllUnhandledErrors(ArchiveOrError.takeError(), llvm::errs(), + "Unpack Archives error: "); + return AMD_COMGR_STATUS_ERROR; + } + + auto Archive = std::move(ArchiveOrError.get()); + + Err = Error::success(); + for (const object::Archive::Child &C : Archive->children(Err)) { + + // Get child 
name + Expected Ename = C.getName(); + if (Error E = Ename.takeError()) { + errs() << ": "; + WithColor::error() << " failed to read name of archive member" + << ArchiveName << "'\n"; + return AMD_COMGR_STATUS_ERROR; + } + std::string ChildName = Ename.get().str(); + + // Get memory buffer + SMDiagnostic ParseErr; + Expected MemBuf = C.getMemoryBufferRef(); + if (Error E = MemBuf.takeError()) { + errs() << ": "; + WithColor::error() + << " loading memory for member '" + << "' of archive library failed'" << ArchiveName << "'\n"; + return AMD_COMGR_STATUS_ERROR; + }; + + // Link memory buffer into composite + auto Mod = getLazyIRModule(MemoryBuffer::getMemBuffer(MemBuf.get()), + SMDiag, Context, true); + + if (!Mod) { + SMDiag.print(ChildName.c_str(), LogS, /* ShowColors */ false); + return AMD_COMGR_STATUS_ERROR; + } + if (verifyModule(*Mod, &LogS)) + return AMD_COMGR_STATUS_ERROR; + if (L.linkInModule(std::move(Mod), ApplicableFlags)) + return AMD_COMGR_STATUS_ERROR; + } + + llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(), + "Unpack Archives error: "); + } else + continue; } if (verifyModule(*Composite, &LogS)) { diff --git a/amd/comgr/test/unbundle_hip_test.c b/amd/comgr/test/unbundle_hip_test.c index 847fd46aabb5d..fe5276c52372e 100644 --- a/amd/comgr/test/unbundle_hip_test.c +++ b/amd/comgr/test/unbundle_hip_test.c @@ -348,6 +348,79 @@ int main(int Argc, char *Argv[]) { } } + // Unbundle silently via LINK action + { + // Set up ActionInfo + Status = amd_comgr_create_action_info(&ActionInfoLink); + checkError(Status, "amd_comgr_create_action_info"); + + Status = amd_comgr_action_info_set_language(ActionInfoLink, + AMD_COMGR_LANGUAGE_HIP); + checkError(Status, "amd_comgr_action_info_set_language"); + + const char *IsaName = "amdgcn-amd-amdhsa--gfx900"; + Status = amd_comgr_action_info_set_isa_name(ActionInfoLink, IsaName); + + // Unbundle + Status = amd_comgr_create_data_set(&DataSetLinked); + checkError(Status, "amd_comgr_create_data_set"); + Status = 
amd_comgr_do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, ActionInfoLink, + DataSetBundled, DataSetLinked); + checkError(Status, "amd_comgr_do_action"); + + // Check Linked bitcode count + size_t Count; + Status = amd_comgr_action_data_count(DataSetLinked, AMD_COMGR_DATA_KIND_BC, + &Count); + checkError(Status, "amd_comgr_action_data_count"); + + if (Count != 1) { + printf("Bundled bitcode linking: " + "produced %zu bitcodes (expected 1)\n", + Count); + exit(1); + } + + // Compile to relocatable + Status = amd_comgr_create_data_set(&DataSetReloc); + checkError(Status, "amd_comgr_create_data_set"); + + Status = amd_comgr_do_action(AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE, + ActionInfoLink, DataSetLinked, DataSetReloc); + checkError(Status, "amd_comgr_do_action"); + + Status = amd_comgr_action_data_count( + DataSetReloc, AMD_COMGR_DATA_KIND_RELOCATABLE, &Count); + checkError(Status, "amd_comgr_action_data_count"); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_CODEGEN_BC_TO_RELOCATABLE Failed: " + "produced %zu source objects (expected 1)\n", + Count); + exit(1); + } + + // Compile to executable + Status = amd_comgr_create_data_set(&DataSetExec); + checkError(Status, "amd_comgr_create_data_set"); + + Status = + amd_comgr_do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, + ActionInfoLink, DataSetReloc, DataSetExec); + checkError(Status, "amd_comgr_do_action"); + + Status = amd_comgr_action_data_count( + DataSetExec, AMD_COMGR_DATA_KIND_EXECUTABLE, &Count); + checkError(Status, "amd_comgr_action_data_count"); + + if (Count != 1) { + printf("AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE Failed: " + "produced %zu executable objects (expected 1)\n", + Count); + exit(1); + } + } + // Cleanup Status = amd_comgr_destroy_action_info(ActionInfoUnbundle); checkError(Status, "amd_comgr_destroy_action_info"); diff --git a/amd/device-libs/cmake/Packages.cmake b/amd/device-libs/cmake/Packages.cmake index 07c60eb39a227..7406d31e6fad6 100644 --- 
a/amd/device-libs/cmake/Packages.cmake +++ b/amd/device-libs/cmake/Packages.cmake @@ -15,9 +15,6 @@ configure_file(AMDDeviceLibsConfig.cmake.in ${PACKAGE_PREFIX}/AMDDeviceLibsConfig.cmake @ONLY) - -set(install_path_suffix "amdgcn/bitcode") - # Generate the install-tree package. # We do not know the absolute path to the intall tree until we are installed, # so we calculate it dynamically in AMD_DEVICE_LIBS_PREFIX_CODE and use @@ -38,7 +35,7 @@ foreach(target ${AMDGCN_LIB_LIST}) set(AMD_DEVICE_LIBS_TARGET_CODE "${AMD_DEVICE_LIBS_TARGET_CODE} add_library(${target} STATIC IMPORTED) set_target_properties(${target} PROPERTIES - IMPORTED_LOCATION \"\${AMD_DEVICE_LIBS_PREFIX}/${install_path_suffix}/${target_prefix}${target_name}${target_suffix}\")") + IMPORTED_LOCATION \"\${AMD_DEVICE_LIBS_PREFIX}/${INSTALL_ROOT_SUFFIX}/${target_prefix}${target_name}${target_suffix}\")") endforeach() configure_file(AMDDeviceLibsConfig.cmake.in ${CMAKE_CURRENT_BINARY_DIR}/AMDDeviceLibsConfig.cmake.install diff --git a/bolt/README.md b/bolt/README.md index bf52f8c416915..902d1eb6e7694 100644 --- a/bolt/README.md +++ b/bolt/README.md @@ -164,7 +164,7 @@ $ perf2bolt -p perf.data -o perf.fdata This command will aggregate branch data from `perf.data` and store it in a format that is both more compact and more resilient to binary modifications. -If the profile was collected without brstacks, you will need to add `-nl` flag to +If the profile was collected without brstacks, you will need to add `-ba` flag to the command line above. ### Step 3: Optimize with BOLT diff --git a/bolt/docs/Heatmaps.md b/bolt/docs/Heatmaps.md index b10e72f0632e3..a9def71638465 100644 --- a/bolt/docs/Heatmaps.md +++ b/bolt/docs/Heatmaps.md @@ -21,7 +21,7 @@ $ perf record -e cycles:u -j any,u [-p PID|-a] -- sleep ``` Running with brstack (`-j any,u` or `-b`) is recommended. 
Heatmaps can be generated -from basic events by using the llvm-bolt-heatmap option `-nl` (no brstack) but +from basic events by using the llvm-bolt-heatmap option `-ba` (basic events) but such heatmaps do not have the coverage provided by brstack and may only be useful for finding event hotspots at larger code block granularities. diff --git a/bolt/docs/index.rst b/bolt/docs/index.rst index fce6d25500237..07cefd0476422 100644 --- a/bolt/docs/index.rst +++ b/bolt/docs/index.rst @@ -205,8 +205,8 @@ This command will aggregate branch data from ``perf.data`` and store it in a format that is both more compact and more resilient to binary modifications. -If the profile was collected without LBRs, you will need to add ``-nl`` -flag to the command line above. +If the profile was collected without brstacks, you will need to add `-ba` flag to +the command line above. Step 3: Optimize with BOLT ~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 7b10b2d28d7e3..b215a1558cbb4 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -620,13 +620,11 @@ class BinaryFunction { } /// Return a label at a given \p Address in the function. If the label does - /// not exist - create it. Assert if the \p Address does not belong to - /// the function. If \p CreatePastEnd is true, then return the function - /// end label when the \p Address points immediately past the last byte - /// of the function. + /// not exist - create it. + /// /// NOTE: the function always returns a local (temp) symbol, even if there's /// a global symbol that corresponds to an entry at this address. - MCSymbol *getOrCreateLocalLabel(uint64_t Address, bool CreatePastEnd = false); + MCSymbol *getOrCreateLocalLabel(uint64_t Address); /// Register an data entry at a given \p Offset into the function. 
void markDataAtOffset(uint64_t Offset) { @@ -2142,8 +2140,9 @@ class BinaryFunction { } /// Detects whether \p Address is inside a data region in this function - /// (constant islands). - bool isInConstantIsland(uint64_t Address) const { + /// (constant islands), and optionally return the island size starting + /// from the given \p Address. + bool isInConstantIsland(uint64_t Address, uint64_t *Size = nullptr) const { if (!Islands) return false; @@ -2161,10 +2160,15 @@ class BinaryFunction { DataIter = std::prev(DataIter); auto CodeIter = Islands->CodeOffsets.upper_bound(Offset); - if (CodeIter == Islands->CodeOffsets.begin()) + if (CodeIter == Islands->CodeOffsets.begin() || + *std::prev(CodeIter) <= *DataIter) { + if (Size) + *Size = (CodeIter == Islands->CodeOffsets.end() ? getMaxSize() + : *CodeIter) - + Offset; return true; - - return *std::prev(CodeIter) <= *DataIter; + } + return false; } uint16_t getConstantIslandAlignment() const; diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 7dded4c59ed38..c33540ada8a05 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -77,6 +77,11 @@ cl::opt CompDirOverride( "location, which is used with DW_AT_dwo_name to construct a path " "to *.dwo files."), cl::Hidden, cl::init(""), cl::cat(BoltCategory)); + +static cl::opt + FailOnInvalidPadding("fail-on-invalid-padding", cl::Hidden, cl::init(false), + cl::desc("treat invalid code padding as error"), + cl::ZeroOrMore, cl::cat(BoltCategory)); } // namespace opts namespace llvm { @@ -942,8 +947,7 @@ std::string BinaryContext::generateJumpTableName(const BinaryFunction &BF, } bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) { - // FIXME: aarch64 support is missing. 
- if (!isX86()) + if (!isX86() && !isAArch64()) return true; if (BF.getSize() == BF.getMaxSize()) @@ -973,14 +977,26 @@ bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) { return Offset - StartOffset; }; - // Skip a sequence of zero bytes. + // Skip a sequence of zero bytes. For AArch64 we only skip 4 bytes of zeros + // in case the following zeros belong to constant island or veneer. auto skipZeros = [&]() { const uint64_t StartOffset = Offset; - for (; Offset < BF.getMaxSize(); ++Offset) - if ((*FunctionData)[Offset] != 0) + uint64_t CurrentOffset = Offset; + for (; CurrentOffset < BF.getMaxSize() && + (!isAArch64() || CurrentOffset < StartOffset + 4); + ++CurrentOffset) + if ((*FunctionData)[CurrentOffset] != 0) break; - return Offset - StartOffset; + uint64_t NumZeros = CurrentOffset - StartOffset; + if (isAArch64()) + NumZeros &= ~((uint64_t)0x3); + + if (NumZeros == 0) + return false; + Offset += NumZeros; + InstrAddress += NumZeros; + return true; }; // Accept the whole padding area filled with breakpoints. @@ -993,6 +1009,8 @@ bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) { // Some functions have a jump to the next function or to the padding area // inserted after the body. auto isSkipJump = [&](const MCInst &Instr) { + if (!isX86()) + return false; uint64_t TargetAddress = 0; if (MIB->isUnconditionalBranch(Instr) && MIB->evaluateBranch(Instr, InstrAddress, InstrSize, TargetAddress)) { @@ -1004,34 +1022,73 @@ bool BinaryContext::hasValidCodePadding(const BinaryFunction &BF) { return false; }; + // For veneers that are not already covered by binary functions, only those + // that handleAArch64Veneer() can recognize are checked here. + auto skipAArch64Veneer = [&]() { + if (!isAArch64() || Offset >= BF.getMaxSize()) + return false; + BinaryFunction *BFVeneer = getBinaryFunctionContainingAddress(InstrAddress); + if (BFVeneer) { + // A binary function may have been created to point to this veneer. 
+ Offset += BFVeneer->getSize(); + assert(Offset <= BF.getMaxSize() && + "AArch64 veneeer goes past the max size of function"); + InstrAddress += BFVeneer->getSize(); + return true; + } + const uint64_t AArch64VeneerSize = 12; + if (Offset + AArch64VeneerSize <= BF.getMaxSize() && + handleAArch64Veneer(InstrAddress, /*MatchOnly*/ true)) { + Offset += AArch64VeneerSize; + InstrAddress += AArch64VeneerSize; + this->errs() << "BOLT-WARNING: found unmarked AArch64 veneer at 0x" + << Twine::utohexstr(BF.getAddress() + Offset) << '\n'; + return true; + } + return false; + }; + + auto skipAArch64ConstantIsland = [&]() { + if (!isAArch64() || Offset >= BF.getMaxSize()) + return false; + uint64_t Size; + if (BF.isInConstantIsland(InstrAddress, &Size)) { + Offset += Size; + InstrAddress += Size; + return true; + } + return false; + }; + // Skip over nops, jumps, and zero padding. Allow interleaving (this happens). - while (skipInstructions(isNoop) || skipInstructions(isSkipJump) || + // For AArch64 also check veneers and skip constant islands. 
+ while (skipAArch64Veneer() || skipAArch64ConstantIsland() || + skipInstructions(isNoop) || skipInstructions(isSkipJump) || skipZeros()) ; if (Offset == BF.getMaxSize()) return true; - if (opts::Verbosity >= 1) { - this->errs() << "BOLT-WARNING: bad padding at address 0x" - << Twine::utohexstr(BF.getAddress() + BF.getSize()) - << " starting at offset " << (Offset - BF.getSize()) - << " in function " << BF << '\n' - << FunctionData->slice(BF.getSize(), - BF.getMaxSize() - BF.getSize()) - << '\n'; - } - + this->errs() << "BOLT-WARNING: bad padding at address 0x" + << Twine::utohexstr(BF.getAddress() + BF.getSize()) + << " starting at offset " << (Offset - BF.getSize()) + << " in function " << BF << '\n' + << FunctionData->slice(BF.getSize(), + BF.getMaxSize() - BF.getSize()) + << '\n'; return false; } void BinaryContext::adjustCodePadding() { + uint64_t NumInvalid = 0; for (auto &BFI : BinaryFunctions) { BinaryFunction &BF = BFI.second; if (!shouldEmit(BF)) continue; if (!hasValidCodePadding(BF)) { + NumInvalid++; if (HasRelocations) { this->errs() << "BOLT-WARNING: function " << BF << " has invalid padding. 
Ignoring the function\n"; @@ -1041,6 +1098,11 @@ void BinaryContext::adjustCodePadding() { } } } + if (NumInvalid && opts::FailOnInvalidPadding) { + this->errs() << "BOLT-ERROR: found " << NumInvalid + << " instance(s) of invalid code padding\n"; + exit(1); + } } MCSymbol *BinaryContext::registerNameAtAddress(StringRef Name, uint64_t Address, diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 4dfd4ba6f611b..84023efe1084e 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -1035,13 +1035,9 @@ BinaryFunction::processIndirectBranch(MCInst &Instruction, unsigned Size, return BranchType; } -MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address, - bool CreatePastEnd) { +MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address) { const uint64_t Offset = Address - getAddress(); - if ((Offset == getSize()) && CreatePastEnd) - return getFunctionEndLabel(); - auto LI = Labels.find(Offset); if (LI != Labels.end()) return LI->second; @@ -1052,6 +1048,9 @@ MCSymbol *BinaryFunction::getOrCreateLocalLabel(uint64_t Address, return IslandSym; } + if (Offset == getSize()) + return getFunctionEndLabel(); + MCSymbol *Label = BC.Ctx->createNamedTempSymbol(); Labels[Offset] = Label; @@ -1427,10 +1426,9 @@ Error BinaryFunction::disassemble() { !(BC.isAArch64() && BC.handleAArch64Veneer(TargetAddress, /*MatchOnly*/ true))) { // Result of __builtin_unreachable(). - LLVM_DEBUG(dbgs() << "BOLT-DEBUG: jump past end detected at 0x" - << Twine::utohexstr(AbsoluteInstrAddr) - << " in function " << *this - << " : replacing with nop.\n"); + errs() << "BOLT-WARNING: jump past end detected at 0x" + << Twine::utohexstr(AbsoluteInstrAddr) << " in function " + << *this << " : replacing with nop.\n"; BC.MIB->createNoop(Instruction); if (IsCondBranch) { // Register branch offset for profile validation. 
@@ -1995,7 +1993,7 @@ void BinaryFunction::postProcessJumpTables() { if (IsBuiltinUnreachable) { BinaryFunction *TargetBF = BC.getBinaryFunctionAtAddress(EntryAddress); MCSymbol *Label = TargetBF ? TargetBF->getSymbol() - : getOrCreateLocalLabel(EntryAddress, true); + : getOrCreateLocalLabel(EntryAddress); JT.Entries.push_back(Label); continue; } @@ -2006,7 +2004,7 @@ void BinaryFunction::postProcessJumpTables() { BC.getBinaryFunctionContainingAddress(EntryAddress); MCSymbol *Label; if (HasOneParent && TargetBF == this) { - Label = getOrCreateLocalLabel(EntryAddress, true); + Label = getOrCreateLocalLabel(EntryAddress); } else { const uint64_t Offset = EntryAddress - TargetBF->getAddress(); Label = Offset ? TargetBF->addEntryPointAtOffset(Offset) diff --git a/bolt/lib/Passes/BinaryPasses.cpp b/bolt/lib/Passes/BinaryPasses.cpp index 06e840e84f53b..e1a1856b506cf 100644 --- a/bolt/lib/Passes/BinaryPasses.cpp +++ b/bolt/lib/Passes/BinaryPasses.cpp @@ -346,13 +346,16 @@ void EliminateUnreachableBlocks::runOnFunction(BinaryFunction &Function) { uint64_t Bytes; Function.markUnreachableBlocks(); LLVM_DEBUG({ + bool HasInvalidBB = false; for (BinaryBasicBlock &BB : Function) { if (!BB.isValid()) { + HasInvalidBB = true; dbgs() << "BOLT-INFO: UCE found unreachable block " << BB.getName() << " in function " << Function << "\n"; - Function.dump(); } } + if (HasInvalidBB) + Function.dump(); }); BinaryContext::IndependentCodeEmitter Emitter = BC.createIndependentMCCodeEmitter(); diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index 9faccc23b4b81..dc3d918d14bd6 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -45,10 +45,23 @@ using namespace bolt; namespace opts { static cl::opt - BasicAggregation("nl", - cl::desc("aggregate basic samples (without brstack info)"), + BasicAggregation("basic-events", + cl::desc("aggregate basic events (without brstack info)"), cl::cat(AggregatorCategory)); +static 
cl::alias BasicAggregationAlias("ba", + cl::desc("Alias for --basic-events"), + cl::aliasopt(BasicAggregation)); + +static cl::opt DeprecatedBasicAggregationNl( + "nl", cl::desc("Alias for --basic-events (deprecated. Use --ba)"), + cl::cat(AggregatorCategory), cl::ReallyHidden, + cl::callback([](const bool &Enabled) { + errs() + << "BOLT-WARNING: '-nl' is deprecated, please use '--ba' instead.\n"; + BasicAggregation = Enabled; + })); + cl::opt ArmSPE("spe", cl::desc("Enable Arm SPE mode."), cl::cat(AggregatorCategory)); @@ -1433,7 +1446,7 @@ std::error_code DataAggregator::printLBRHeatMap() { "Cannot build heatmap."; } else { errs() << "HEATMAP-ERROR: no brstack traces detected in profile. " - "Cannot build heatmap. Use -nl for building heatmap from " + "Cannot build heatmap. Use -ba for building heatmap from " "basic events.\n"; } exit(1); @@ -1629,8 +1642,8 @@ std::error_code DataAggregator::parseBranchEvents() { << "PERF2BOLT-WARNING: all recorded samples for this binary lack " "brstack. 
Record profile with perf record -j any or run " "perf2bolt " - "in non-brstack mode with -nl (the performance improvement in " - "-nl " + "in non-brstack mode with -ba (the performance improvement in " + "-ba " "mode may be limited)\n"; else errs() diff --git a/bolt/lib/Rewrite/RewriteInstance.cpp b/bolt/lib/Rewrite/RewriteInstance.cpp index 6fa66ab55210d..77e5688781d57 100644 --- a/bolt/lib/Rewrite/RewriteInstance.cpp +++ b/bolt/lib/Rewrite/RewriteInstance.cpp @@ -2949,9 +2949,7 @@ void RewriteInstance::handleRelocation(const SectionRef &RelocatedSection, ReferencedSymbol = ReferencedBF->addEntryPointAtOffset(RefFunctionOffset); } else { - ReferencedSymbol = - ReferencedBF->getOrCreateLocalLabel(Address, - /*CreatePastEnd =*/true); + ReferencedSymbol = ReferencedBF->getOrCreateLocalLabel(Address); // If ContainingBF != nullptr, it equals ReferencedBF (see // if-condition above) so we're handling a relocation from a function diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 6954cb295e86a..7769162d67eaf 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -157,6 +157,10 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { MCPhysReg getStackPointer() const override { return AArch64::SP; } MCPhysReg getFramePointer() const override { return AArch64::FP; } + bool isBreakpoint(const MCInst &Inst) const override { + return Inst.getOpcode() == AArch64::BRK; + } + bool isPush(const MCInst &Inst) const override { return isStoreToStack(Inst); }; @@ -2153,7 +2157,7 @@ class AArch64MCPlusBuilder : public MCPlusBuilder { --I; Address -= 4; - if (I->getOpcode() != AArch64::ADRP || + if (I != Begin || I->getOpcode() != AArch64::ADRP || MCPlus::getNumPrimeOperands(*I) < 2 || !I->getOperand(0).isReg() || !I->getOperand(1).isImm() || I->getOperand(0).getReg() != AArch64::X16) return 0; diff --git a/bolt/test/AArch64/Inputs/plt-gnu-ld.yaml 
b/bolt/test/AArch64/Inputs/plt-gnu-ld.yaml index 1e3353acb633a..f6dc60d062ced 100644 --- a/bolt/test/AArch64/Inputs/plt-gnu-ld.yaml +++ b/bolt/test/AArch64/Inputs/plt-gnu-ld.yaml @@ -93,7 +93,7 @@ Sections: Flags: [ SHF_ALLOC, SHF_EXECINSTR ] Address: 0x400510 AddressAlign: 0x8 - Content: 1D0080D21E0080D2E50300AAE10340F9E2230091E603009100000090002015910300009063201B910400009084201D91E0FFFF97EBFFFF972F0000148000009000F047F9400000B4E2FFFF17C0035FD6800000B000000191810000B0210001913F0000EBC00000540100009021B443F9610000B4F00301AA00021FD6C0035FD6800000B000000191810000B021000191210000CB22FC7FD3410C818BFF0781EB21FC4193C00000540200009042B843F9620000B4F00302AA00021FD6C0035FD6FD7BBEA9FD030091F30B00F9930000B06002413980000035DEFFFF972000805260020139F30B40F9FD7BC2A8C0035FD6E4FFFF17FF8300D1FD7B01A9FD430091BFC31FB8E1230091E8DD9752A8D5BB72E80300B9E80B00B9E0130091FF0700B9880000B00900009029C11291092500F9082540F9820080D200013FD6E90340B9E80740B90801096BA00000540100001428008052A8C31FB814000014880000B00900009029411391092900F9082940F9E0230091E1031F2A820080D200013FD6E80B40B9A80000340100001428008052A8C31FB8050000140000009000E01D9194FFFF9701000014A0C35FB8FD7B41A9FF830091C0035FD6FD7BBCA9FD030091F35301A99400009094023891F55B02A995000090B5E23791940215CBF603002AF76303A9F70301AAF80302AA5DFFFF97FF0F94EB6001005494FE4393130080D2A37A73F8E20318AA73060091E10317AAE003162A60003FD69F0213EB21FFFF54F35341A9F55B42A9F76343A9FD7BC4A8C0035FD61F2003D5C0035FD6 + Content: 
1D0080D21E0080D2E50300AAE10340F9E2230091E603009100000090002015910300009063201B910400009084201D91E0FFFF97EBFFFF972F0000148000009000F047F9400000B4E2FFFF17C0035FD6800000B000000191810000B0210001913F0000EBC00000540100009021B443F9610000B4F00301AA00021FD6C0035FD6800000B000000191810000B021000191210000CB22FC7FD3410C818BFF0781EB21FC4193C00000540200009042B843F9620000B4F00302AA00021FD6C0035FD6FD7BBEA9FD030091F30B00F9930000B06002413980000035DEFFFF972000805260020139F30B40F9FD7BC2A8C0035FD6E4FFFF17FF8300D1FD7B01A9FD430091BFC31FB8E1230091E8DD9752A8D5BB72E80300B9E80B00B9E0130091FF0700B9880000B00900009029C11291092500F9082540F9820080D200013FD6E90340B9E80740B90801096BA00000540100001428008052A8C31FB814000014880000B00900009029411391092900F9082940F9E0230091E1031F2A820080D200013FD6E80B40B9A80000340100001428008052A8C31FB8050000140000009000E01D9194FFFF9701000014A0C35FB8FD7B41A9FF830091C0035FD6FD7BBCA9FD030091F35301A99400009094023891F55B02A995000090B5E23791940215CBF603002AF76303A9F70301AAF80302AA60003FD6FF0F94EB6001005494FE4393130080D2A37A73F8E20318AA73060091E10317AAE003162A60003FD69F0213EB21FFFF54F35341A9F55B42A9F76343A9FD7BC4A8C0035FD61F2003D5C0035FD6 - Name: .rodata Type: SHT_PROGBITS Flags: [ SHF_ALLOC ] @@ -435,6 +435,12 @@ Symbols: Binding: STB_GLOBAL Value: 0x400604 Size: 0xC4 + - Name: post_main + Type: STT_FUNC + Section: .text + Binding: STB_GLOBAL + Value: 0x4006C8 + Size: 0x7C - Name: 'printf@@GLIBC_2.17' Type: STT_FUNC Binding: STB_GLOBAL diff --git a/bolt/test/AArch64/invalid-code-padding.s b/bolt/test/AArch64/invalid-code-padding.s new file mode 100644 index 0000000000000..4706e600621ab --- /dev/null +++ b/bolt/test/AArch64/invalid-code-padding.s @@ -0,0 +1,34 @@ +# RUN: %clang %cflags %s -o %t.so -Wl,-q +# RUN: not llvm-bolt %t.so -o %t.bolt --fail-on-invalid-padding \ +# RUN: 2>&1 | FileCheck %s +# CHECK: BOLT-ERROR: found 1 instance(s) of invalid code padding + + .text + .align 2 + .global foo + .type foo, %function +foo: + cmp x0, x1 + b.eq .Ltmp1 + adrp x1, jmptbl + add 
x1, x1, :lo12:jmptbl + ldrsw x2, [x1, x2, lsl #2] + br x2 + b .Ltmp1 +.Ltmp2: + add x0, x0, x1 + ret + .size foo, .-foo + +.Ltmp1: + add x0, x0, x1 + b .Ltmp2 + + # Dummy relocation to force relocation mode + .reloc 0, R_AARCH64_NONE + + .section .rodata, "a" + .align 2 + .global jmptbl +jmptbl: + .word .text+0x28 - . diff --git a/bolt/test/AArch64/plt-got.test b/bolt/test/AArch64/plt-got.test index be1c095784b70..18b1ea4bf908a 100644 --- a/bolt/test/AArch64/plt-got.test +++ b/bolt/test/AArch64/plt-got.test @@ -1,7 +1,8 @@ // This test checks .plt.got handling by BOLT RUN: yaml2obj %p/Inputs/plt-got.yaml &> %t.exe -RUN: llvm-bolt %t.exe -o %t.bolt --print-disasm --print-only=_start/1 | \ -RUN: FileCheck %s +RUN: llvm-bolt %t.exe -o %t.bolt --print-disasm --print-only=_start/1 \ +RUN: 2>&1 | FileCheck %s CHECK: bl abort@PLT +CHECK: BOLT-WARNING: found unmarked AArch64 veneer diff --git a/bolt/test/X86/nolbr.s b/bolt/test/X86/nolbr.s index d6710deeec343..432797eaaa129 100644 --- a/bolt/test/X86/nolbr.s +++ b/bolt/test/X86/nolbr.s @@ -10,7 +10,7 @@ # RUN: FileCheck %s --input-file %t.fdata --check-prefix=CHECK-FDATA # RUN: llvm-strip --strip-unneeded %t.o # RUN: %clang %cflags %t.o -o %t.exe -Wl,-q -nostdlib -# RUN: llvm-bolt %t.exe -o %t.out --data %t.fdata --dyno-stats -nl \ +# RUN: llvm-bolt %t.exe -o %t.out --data %t.fdata --dyno-stats -ba \ # RUN: --print-only=_start 2>&1 | FileCheck %s --check-prefix=CHECK-BOLT # CHECK-FDATA: no_lbr diff --git a/bolt/test/X86/pre-aggregated-perf.test b/bolt/test/X86/pre-aggregated-perf.test index c4f5b8f2ea445..6951d09db3de6 100644 --- a/bolt/test/X86/pre-aggregated-perf.test +++ b/bolt/test/X86/pre-aggregated-perf.test @@ -61,11 +61,11 @@ RUN: FileCheck %s -check-prefix=NEWFORMAT --input-file %t.bolt.yaml RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated-basic.txt -o %t.ba \ RUN: 2>&1 | FileCheck %s --check-prefix=BASIC-ERROR RUN: perf2bolt %t.exe -o %t --pa -p %p/Inputs/pre-aggregated-basic.txt -o %t.ba.nl \ -RUN: 
-nl 2>&1 | FileCheck %s --check-prefix=BASIC-SUCCESS -RUN: FileCheck %s --input-file %t.ba.nl --check-prefix CHECK-BASIC-NL +RUN: -ba 2>&1 | FileCheck %s --check-prefix=BASIC-SUCCESS +RUN: FileCheck %s --input-file %t.ba.nl --check-prefix CHECK-BASIC-BA BASIC-ERROR: BOLT-INFO: 0 out of 7 functions in the binary (0.0%) have non-empty execution profile BASIC-SUCCESS: BOLT-INFO: 4 out of 7 functions in the binary (57.1%) have non-empty execution profile -CHECK-BASIC-NL: no_lbr cycles +CHECK-BASIC-BA: no_lbr cycles PERF2BOLT: 1 frame_dummy/1 1e 1 frame_dummy/1 0 0 1 PERF2BOLT-NEXT: 1 main 451 1 SolveCubic 0 0 2 diff --git a/bolt/test/perf2bolt/perf_test.test b/bolt/test/perf2bolt/perf_test.test index 08b3413a67a0b..984432716992a 100644 --- a/bolt/test/perf2bolt/perf_test.test +++ b/bolt/test/perf2bolt/perf_test.test @@ -4,7 +4,7 @@ REQUIRES: system-linux, perf RUN: %clang %S/Inputs/perf_test.c -fuse-ld=lld -pie -Wl,--script=%S/Inputs/perf_test.lds -o %t RUN: perf record -Fmax -e cycles:u -o %t2 -- %t -RUN: perf2bolt %t -p=%t2 -o %t3 -nl -ignore-build-id --show-density \ +RUN: perf2bolt %t -p=%t2 -o %t3 -ba -ignore-build-id --show-density \ RUN: --heatmap %t.hm 2>&1 | FileCheck %s RUN: FileCheck %s --input-file %t.hm-section-hotness.csv --check-prefix=CHECK-HM @@ -15,7 +15,7 @@ CHECK: BOLT-INFO: Functions with density >= {{.*}} account for 99.00% total samp RUN: %clang %S/Inputs/perf_test.c -no-pie -fuse-ld=lld -o %t4 RUN: perf record -Fmax -e cycles:u -o %t5 -- %t4 -RUN: perf2bolt %t4 -p=%t5 -o %t6 -nl -ignore-build-id --show-density \ +RUN: perf2bolt %t4 -p=%t5 -o %t6 -ba -ignore-build-id --show-density \ RUN: --heatmap %t.hm2 2>&1 | FileCheck %s RUN: FileCheck %s --input-file %t.hm2-section-hotness.csv --check-prefix=CHECK-HM diff --git a/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css b/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css index e555ee7c370f7..67f11f77eae61 100644 --- a/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css +++ 
b/clang-tools-extra/clang-doc/assets/clang-doc-mustache.css @@ -396,7 +396,7 @@ body, html { .sidebar { width: 250px; - top: 0; + top: 60px; left: 0; height: 100%; position: fixed; diff --git a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp index e825547ba0134..799e02ffaf3af 100644 --- a/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp +++ b/clang-tools-extra/clang-include-fixer/IncludeFixer.cpp @@ -90,6 +90,7 @@ bool IncludeFixerActionFactory::runInvocation( // Set up Clang. CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps)); + Compiler.setVirtualFileSystem(Files->getVirtualFileSystemPtr()); Compiler.setFileManager(Files); // Create the compiler's actual diagnostics engine. We want to drop all diff --git a/clang-tools-extra/clang-include-fixer/find-all-symbols/STLPostfixHeaderMap.cpp b/clang-tools-extra/clang-include-fixer/find-all-symbols/STLPostfixHeaderMap.cpp index 410df1288b1cc..d43a3ea327a39 100644 --- a/clang-tools-extra/clang-include-fixer/find-all-symbols/STLPostfixHeaderMap.cpp +++ b/clang-tools-extra/clang-include-fixer/find-all-symbols/STLPostfixHeaderMap.cpp @@ -13,6 +13,9 @@ namespace find_all_symbols { const HeaderMapCollector::RegexHeaderMap *getSTLPostfixHeaderMap() { static const HeaderMapCollector::RegexHeaderMap STLPostfixHeaderMap = { + {"include/__float_float.h$", ""}, + {"include/__float_header_macro.h$", ""}, + {"include/__float_infinity_nan.h$", ""}, {"include/__stdarg___gnuc_va_list.h$", ""}, {"include/__stdarg___va_copy.h$", ""}, {"include/__stdarg_header_macro.h$", ""}, diff --git a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp index 0399af2a673f4..61ccd26e48c1e 100644 --- a/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp +++ b/clang-tools-extra/clang-tidy/bugprone/UnsafeFunctionsCheck.cpp @@ -49,7 +49,7 @@ static StringRef 
getReplacementFor(StringRef FunctionName, // Try to find a better replacement from Annex K first. StringRef AnnexKReplacementFunction = StringSwitch(FunctionName) - .Cases("asctime", "asctime_r", "asctime_s") + .Cases({"asctime", "asctime_r"}, "asctime_s") .Case("gets", "gets_s") .Default({}); if (!AnnexKReplacementFunction.empty()) @@ -59,7 +59,7 @@ static StringRef getReplacementFor(StringRef FunctionName, // FIXME: Some of these functions are available in C++ under "std::", and // should be matched and suggested. return StringSwitch(FunctionName) - .Cases("asctime", "asctime_r", "strftime") + .Cases({"asctime", "asctime_r"}, "strftime") .Case("gets", "fgets") .Case("rewind", "fseek") .Case("setbuf", "setvbuf"); @@ -90,13 +90,13 @@ static StringRef getReplacementForAdditional(StringRef FunctionName, /// safer alternative. static StringRef getRationaleFor(StringRef FunctionName) { return StringSwitch(FunctionName) - .Cases("asctime", "asctime_r", "ctime", + .Cases({"asctime", "asctime_r", "ctime"}, "is not bounds-checking and non-reentrant") - .Cases("bcmp", "bcopy", "bzero", "is deprecated") - .Cases("fopen", "freopen", "has no exclusive access to the opened file") + .Cases({"bcmp", "bcopy", "bzero"}, "is deprecated") + .Cases({"fopen", "freopen"}, "has no exclusive access to the opened file") .Case("gets", "is insecure, was deprecated and removed in C11 and C++14") .Case("getpw", "is dangerous as it may overflow the provided buffer") - .Cases("rewind", "setbuf", "has no error detection") + .Cases({"rewind", "setbuf"}, "has no error detection") .Case("vfork", "is insecure as it can lead to denial of service " "situations in the parent process") .Default("is not bounds-checking"); diff --git a/clang-tools-extra/clangd/index/CanonicalIncludes.cpp b/clang-tools-extra/clangd/index/CanonicalIncludes.cpp index 1f7b525c8c2b2..cbef64f351341 100644 --- a/clang-tools-extra/clangd/index/CanonicalIncludes.cpp +++ b/clang-tools-extra/clangd/index/CanonicalIncludes.cpp @@ 
-16,6 +16,9 @@ namespace clang { namespace clangd { namespace { const std::pair IncludeMappings[] = { + {"include/__float_float.h", ""}, + {"include/__float_header_macro.h", ""}, + {"include/__float_infinity_nan.h", ""}, {"include/__stdarg___gnuc_va_list.h", ""}, {"include/__stdarg___va_copy.h", ""}, {"include/__stdarg_header_macro.h", ""}, diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/switch-missing-default-case.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/switch-missing-default-case.rst index 648c2c208a4ec..3ce862ff8afcc 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/switch-missing-default-case.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/switch-missing-default-case.rst @@ -9,8 +9,8 @@ on covering cases with non-enums where the compiler may not issue warnings. Switch statements without a default case can lead to unexpected behavior and incomplete handling of all possible cases. When a switch statement lacks a default case, if a value is encountered that does not match any of the -specified cases, the program will continue execution without any defined -behavior or handling. +specified cases, the switch statement will do nothing and the program will +continue execution without handling the value. This check helps identify switch statements that are missing a default case, allowing developers to ensure that all possible cases are handled properly. diff --git a/clang/bindings/python/clang/cindex.py b/clang/bindings/python/clang/cindex.py index da97bd375608c..2786add27f5e8 100644 --- a/clang/bindings/python/clang/cindex.py +++ b/clang/bindings/python/clang/cindex.py @@ -3015,7 +3015,7 @@ class _CXUnsavedFile(Structure): # Functions calls through the python interface are rather slow. Fortunately, -# for most symboles, we do not need to perform a function call. Their spelling +# for most symbols, we do not need to perform a function call. 
Their spelling # never changes and is consequently provided by this spelling cache. SPELLING_CACHE = { # 0: CompletionChunk.Kind("Optional"), diff --git a/clang/cmake/caches/Release.cmake b/clang/cmake/caches/Release.cmake index a523cc561b3f9..82bfdc0cfd565 100644 --- a/clang/cmake/caches/Release.cmake +++ b/clang/cmake/caches/Release.cmake @@ -44,6 +44,19 @@ set(LLVM_RELEASE_ENABLE_LTO THIN CACHE STRING "") set(LLVM_RELEASE_ENABLE_PGO ON CACHE BOOL "") set(LLVM_RELEASE_ENABLE_RUNTIMES ${DEFAULT_RUNTIMES} CACHE STRING "") set(LLVM_RELEASE_ENABLE_PROJECTS ${DEFAULT_PROJECTS} CACHE STRING "") + +# This option enables linking stage2 clang statically with the runtimes +# (libc++ and compiler-rt) from stage1. In theory this will give the +# binaries better performance and make them more portable. However, +# this configuration is not well tested and causes build failures with +# the flang-rt tests cases, since the -stclib=libc++ flag does not +# get propagated to the runtimes build. There is also a separate +# issue on Darwin where clang will use the local libc++ headers, but +# link with system libc++ which can cause some incompatibilities. +# See https://github.com/llvm/llvm-project/issues/77653 +# Because of these problems, this option will default to OFF. +set(LLVM_RELEASE_ENABLE_LINK_LOCAL_RUNTIMES OFF CACHE BOOL "") + # Note we don't need to add install here, since it is one of the pre-defined # steps. set(LLVM_RELEASE_FINAL_STAGE_TARGETS "clang;package;check-all;check-llvm;check-clang" CACHE STRING "") @@ -55,8 +68,12 @@ set(CLANG_ENABLE_BOOTSTRAP ON CACHE BOOL "") set(STAGE1_PROJECTS "clang") +# Need to build compiler-rt in order to use PGO for later stages. +set(STAGE1_RUNTIMES "compiler-rt") # Build all runtimes so we can statically link them into the stage2 compiler. 
-set(STAGE1_RUNTIMES "compiler-rt;libcxx;libcxxabi;libunwind") +if(LLVM_RELEASE_ENABLE_LINK_LOCAL_RUNTIMES) + list(APPEND STAGE1_RUNTIMES "libcxx;libcxxabi;libunwind") +endif() if (LLVM_RELEASE_ENABLE_PGO) list(APPEND STAGE1_PROJECTS "lld") @@ -118,11 +135,13 @@ set_instrument_and_final_stage_var(LLVM_ENABLE_LTO "${LLVM_RELEASE_ENABLE_LTO}" if (LLVM_RELEASE_ENABLE_LTO) set_instrument_and_final_stage_var(LLVM_ENABLE_LLD "ON" BOOL) endif() -set_instrument_and_final_stage_var(LLVM_ENABLE_LIBCXX "ON" BOOL) -set_instrument_and_final_stage_var(LLVM_STATIC_LINK_CXX_STDLIB "ON" BOOL) -set(RELEASE_LINKER_FLAGS "-rtlib=compiler-rt --unwindlib=libunwind") -if(NOT ${CMAKE_HOST_SYSTEM_NAME} MATCHES "Darwin") - set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -static-libgcc") +if(LLVM_RELEASE_ENABLE_LINK_LOCAL_RUNTIMES) + set_instrument_and_final_stage_var(LLVM_ENABLE_LIBCXX "ON" BOOL) + set_instrument_and_final_stage_var(LLVM_STATIC_LINK_CXX_STDLIB "ON" BOOL) + set(RELEASE_LINKER_FLAGS "-rtlib=compiler-rt --unwindlib=libunwind") + if(NOT ${CMAKE_HOST_SYSTEM_NAME} MATCHES "Darwin") + set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -static-libgcc") + endif() endif() # Set flags for bolt @@ -130,9 +149,11 @@ if (${CMAKE_HOST_SYSTEM_NAME} MATCHES "Linux") set(RELEASE_LINKER_FLAGS "${RELEASE_LINKER_FLAGS} -Wl,--emit-relocs,-znow") endif() -set_instrument_and_final_stage_var(CMAKE_EXE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) -set_instrument_and_final_stage_var(CMAKE_SHARED_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) -set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) +if (RELEASE_LINKER_FLAGS) + set_instrument_and_final_stage_var(CMAKE_EXE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) + set_instrument_and_final_stage_var(CMAKE_SHARED_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) + set_instrument_and_final_stage_var(CMAKE_MODULE_LINKER_FLAGS ${RELEASE_LINKER_FLAGS} STRING) +endif() # Final Stage Config (stage2) 
set_final_stage_var(LLVM_ENABLE_RUNTIMES "${LLVM_RELEASE_ENABLE_RUNTIMES}" STRING) diff --git a/clang/docs/AllocToken.rst b/clang/docs/AllocToken.rst index bda84669456ce..b65e18ccfa967 100644 --- a/clang/docs/AllocToken.rst +++ b/clang/docs/AllocToken.rst @@ -37,8 +37,8 @@ The default mode to calculate tokens is: pointers. Other token ID assignment modes are supported, but they may be subject to -change or removal. These may (experimentally) be selected with ``-mllvm --alloc-token-mode=``: +change or removal. These may (experimentally) be selected with ``-Xclang +-falloc-token-mode=``: * ``typehash``: This mode assigns a token ID based on the hash of the allocated type's name. diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst index 29b4288a18c35..cabc193de91f2 100644 --- a/clang/docs/LanguageExtensions.rst +++ b/clang/docs/LanguageExtensions.rst @@ -32,7 +32,7 @@ Feature Checking Macros ======================= Language extensions can be very useful, but only if you know you can depend on -them. In order to allow fine-grain features checks, we support three builtin +them. In order to allow fine-grained feature checks, we support three builtin function-like macros. This allows you to directly test for a feature in your code without having to resort to something like autoconf or fragile "compiler version checks". @@ -134,7 +134,7 @@ feature) or 0 if not. They can be used like this: .. _langext-has-feature-back-compat: For backward compatibility, ``__has_feature`` can also be used to test -for support for non-standardized features, i.e. features not prefixed ``c_``, +for support for non-standardized features, i.e., features not prefixed ``c_``, ``cxx_`` or ``objc_``. Another use of ``__has_feature`` is to check for compiler features not related @@ -156,7 +156,7 @@ of ``cxx_rvalue_references``. This function-like macro is available in C++20 by default, and is provided as an extension in earlier language standards. 
It takes a single argument that is the -name of a double-square-bracket-style attribute. The argument can either be a +name of a double-square-bracket-style attribute. The argument can be either a single identifier or a scoped identifier. If the attribute is supported, a nonzero value is returned. If the attribute is a standards-based attribute, this macro returns a nonzero value based on the year and month in which the attribute @@ -317,7 +317,7 @@ To test for this feature, use ``#if defined(__has_include)``: .. code-block:: c++ - // To avoid problem with non-clang compilers not having this macro. + // To avoid problems with non-clang compilers not having this macro. #if defined(__has_include) #if __has_include("myinclude.h") # include "myinclude.h" @@ -342,7 +342,7 @@ or 0 otherwise: # include_next "myinclude.h" #endif - // To avoid problem with non-clang compilers not having this macro. + // To avoid problems with non-clang compilers not having this macro. #if defined(__has_include_next) #if __has_include_next("myinclude.h") # include_next "myinclude.h" @@ -358,7 +358,7 @@ absolute path is used in the file argument. ----------------- This function-like macro takes a string literal that represents a command line -option for a warning and returns true if that is a valid warning option. +option for a warning and returns ``true`` if that is a valid warning option. .. code-block:: c++ @@ -393,7 +393,7 @@ Builtin Macros file. ``__clang__`` - Defined when compiling with Clang + Defined when compiling with Clang. ``__clang_major__`` Defined to the major marketing version number of Clang (e.g., the 2 in @@ -596,7 +596,7 @@ NEON vector types are created using ``neon_vector_type`` and GCC vector types are created using the ``vector_size(N)`` attribute. The argument ``N`` specifies the number of bytes that will be allocated for an -object of this type. The size has to be multiple of the size of the vector +object of this type. 
The size has to be a multiple of the size of the vector element type. For example: .. code-block:: c++ @@ -617,7 +617,7 @@ element type. For example: Boolean Vectors --------------- -Clang also supports the ext_vector_type attribute with boolean element types in +Clang also supports the ``ext_vector_type`` attribute with boolean element types in C and C++. For example: .. code-block:: c++ @@ -671,7 +671,7 @@ Vector Literals Vector literals can be used to create vectors from a set of scalars, or vectors. Either parentheses or braces form can be used. In the parentheses -form the number of literal values specified must be one, i.e. referring to a +form the number of literal values specified must be one, i.e., referring to a scalar value, or must match the size of the vector type being created. If a single scalar literal value is specified, the scalar literal value will be replicated to all the components of the vector type. In the brackets form any @@ -722,12 +722,12 @@ address &v[i] no no no [#]_ no no See also :ref:`langext-__builtin_shufflevector`, :ref:`langext-__builtin_convertvector`. -.. [#] ternary operator(?:) has different behaviors depending on condition - operand's vector type. If the condition is a GNU vector (i.e. __vector_size__), +.. [#] ternary operator(?:) has different behaviors depending on the condition + operand's vector type. If the condition is a GNU vector (i.e., ``__vector_size__``), a NEON vector or an SVE vector, it's only available in C++ and uses normal bool conversions (that is, != 0). If it's an extension (OpenCL) vector, it's only available in C and OpenCL C. - And it selects based on signedness of the condition operands (OpenCL v1.1 s6.3.9). + And it selects based on the signedness of the condition operands (OpenCL v1.1 s6.3.9). .. [#] sizeof can only be used on vector length specific SVE types. .. [#] Clang does not allow the address of an element to be taken while GCC allows this. 
This is intentional for vectors with a boolean element type and @@ -744,7 +744,7 @@ to perform additional operations on certain scalar and vector types. Let ``T`` be one of the following types: * an integer type (as in C23 6.2.5p22), but excluding enumerated types and ``bool`` -* the standard floating types float or double +* the standard floating types ``float`` or ``double`` * a half-precision floating point type, if one is supported on the target * a vector type. @@ -830,7 +830,7 @@ of different sizes and signs is forbidden in binary and ternary builtins. T __builtin_elementwise_canonicalize(T x) return the platform specific canonical encoding floating point types of a floating-point number T __builtin_elementwise_copysign(T x, T y) return the magnitude of x with the sign of y. floating point types - T __builtin_elementwise_fmod(T x, T y) return The floating-point remainder of (x/y) whose sign floating point types + T __builtin_elementwise_fmod(T x, T y) return the floating-point remainder of (x/y) whose sign floating point types matches the sign of x. T __builtin_elementwise_max(T x, T y) return x or y, whichever is larger integer and floating point types For floating point types, follows semantics of maxNum @@ -945,7 +945,7 @@ Let ``VT`` be a vector type and ``ET`` the element type of ``VT``. Each builtin accesses memory according to a provided boolean mask. These are provided as ``__builtin_masked_load`` and ``__builtin_masked_store``. The first -argument is always boolean mask vector. The ``__builtin_masked_load`` builtin +argument is always a boolean mask vector. The ``__builtin_masked_load`` builtin takes an optional third vector argument that will be used for the result of the masked-off lanes. These builtins assume the memory is unaligned, use ``__builtin_assume_aligned`` if alignment is desired. @@ -1106,7 +1106,7 @@ to ``float``; see below for more information on this emulation. 
``__fp16`` and ``_Float16`` both use the binary16 format from IEEE 754-2008, which provides a 5-bit exponent and an 11-bit significand -(counting the implicit leading 1). ``__bf16`` uses the `bfloat16 +(including the implicit leading 1). ``__bf16`` uses the `bfloat16 `_ format, which provides an 8-bit exponent and an 8-bit significand; this is the same exponent range as `float`, just with greatly reduced precision. @@ -1140,7 +1140,7 @@ types with the ``-ffloat16-excess-precision=`` and * ``none``: meaning to perform strict operation-by-operation emulation * ``standard``: meaning that excess precision is permitted under the rules - described in the standard, i.e. never across explicit casts or statements + described in the standard, i.e., never across explicit casts or statements * ``fast``: meaning that excess precision is permitted whenever the optimizer sees an opportunity to avoid truncations; currently this has no effect beyond ``standard`` @@ -1896,7 +1896,7 @@ Type Trait Primitives Type trait primitives are special builtin constant expressions that can be used by the standard C++ library to facilitate or simplify the implementation of -user-facing type traits in the header. +user-facing type traits in the ```` header. They are not intended to be used directly by user code because they are implementation-defined and subject to change -- as such they're tied closely to @@ -2051,7 +2051,7 @@ The following type trait primitives are supported by Clang. Those traits marked * ``__is_volatile`` (C++, Embarcadero) * ``__reference_binds_to_temporary(T, U)`` (Clang): Determines whether a reference of type ``T`` bound to an expression of type ``U`` would bind to a - materialized temporary object. If ``T`` is not a reference type the result + materialized temporary object. If ``T`` is not a reference type, the result is false. Note this trait will also return false when the initialization of ``T`` from ``U`` is ill-formed. 
Deprecated, use ``__reference_constructs_from_temporary``. @@ -2179,7 +2179,7 @@ Prior to clang-16, the output may only be used safely when the indirect branches are not taken. Query for this difference with ``__has_extension(gnu_asm_goto_with_outputs_full)``. -When using tied-outputs (i.e. outputs that are inputs and outputs, not just +When using tied-outputs (i.e., outputs that are inputs and outputs, not just outputs) with the `+r` constraint, there is a hidden input that's created before the label, so numeric references to operands must account for that. @@ -2358,7 +2358,7 @@ Loading from a ``__weak`` variable always implicitly retains the loaded value. In non-ARC modes, this retain is normally balanced by an implicit autorelease. This autorelease can be suppressed by performing the load in the receiver position of a ``-retain`` -message send (e.g. ``[weakReference retain]``); note that this performs +message send (e.g., ``[weakReference retain]``); note that this performs only a single retain (the retain done when primitively loading from the weak reference). @@ -2559,7 +2559,7 @@ When a method that's introduced in the OS newer than the target OS is called, a .. code-block:: objc void my_fun(NSSomeClass* var) { - // If fancyNewMethod was added in e.g. macOS 10.12, but the code is + // If fancyNewMethod was added in e.g., macOS 10.12, but the code is // built with -mmacos-version-min=10.11, then this unconditional call // will emit a -Wunguarded-availability warning: [var fancyNewMethod]; @@ -2722,7 +2722,7 @@ correctly in any circumstances. It can be used if: metaprogramming algorithms to be able to specify/detect types generically. - the generated kernel binary does not contain indirect calls because they - are eliminated using compiler optimizations e.g. devirtualization. + are eliminated using compiler optimizations e.g., devirtualization. - the selected target supports the function pointer like functionality e.g. most CPU targets. 
@@ -2752,12 +2752,12 @@ Extension Specification, section 1.2 `_. This is not conformant behavior and it can only be used portably when the -functions with variadic prototypes do not get generated in binary e.g. the +functions with variadic prototypes do not get generated in binary e.g., the variadic prototype is used to specify a function type with any number of arguments in metaprogramming algorithms in C++ for OpenCL. This extension can also be used when the kernel code is intended for targets -supporting the variadic arguments e.g. majority of CPU targets. +supporting the variadic arguments e.g., majority of CPU targets. **Example of Use**: @@ -5167,8 +5167,8 @@ __builtin_amdgcn_fence ``__builtin_amdgcn_fence`` emits a fence. -* ``unsigned`` atomic ordering, e.g. ``__ATOMIC_ACQUIRE`` -* ``const char *`` synchronization scope, e.g. ``workgroup`` +* ``unsigned`` atomic ordering, e.g., ``__ATOMIC_ACQUIRE`` +* ``const char *`` synchronization scope, e.g., ``workgroup`` * Zero or more ``const char *`` address spaces names. The address spaces arguments must be one of the following string literals: @@ -5315,7 +5315,7 @@ boolean argument as a bit for every lane of the current wave that is currently active (i.e., that is converged with the executing thread), and a 0 bit for every lane that is not active. -The result is uniform, i.e. it is the same in every active thread of the wave. +The result is uniform, i.e., it is the same in every active thread of the wave. __builtin_amdgcn_inverse_ballot_w{32,64} ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ @@ -5531,7 +5531,7 @@ followed by ``off`` or ``on``. All function definitions in the region between an ``off`` and the following ``on`` will be decorated with the ``optnone`` attribute unless doing so would -conflict with explicit attributes already present on the function (e.g. the +conflict with explicit attributes already present on the function (e.g., the ones that control inlining). .. 
code-block:: c++ @@ -5629,12 +5629,12 @@ provides options for vectorization, interleaving, predication, unrolling and distribution. Loop hints can be specified before any loop and will be ignored if the optimization is not safe to apply. -There are loop hints that control transformations (e.g. vectorization, loop +There are loop hints that control transformations (e.g., vectorization, loop unrolling) and there are loop hints that set transformation options (e.g. ``vectorize_width``, ``unroll_count``). Pragmas setting transformation options -imply the transformation is enabled, as if it was enabled via the corresponding -transformation pragma (e.g. ``vectorize(enable)``). If the transformation is -disabled (e.g. ``vectorize(disable)``), that takes precedence over +imply the transformation is enabled, as if it were enabled via the corresponding +transformation pragma (e.g., ``vectorize(enable)``). If the transformation is +disabled (e.g., ``vectorize(disable)``), that takes precedence over transformations option pragmas implying that transformation. Vectorization, Interleaving, and Predication @@ -5861,7 +5861,7 @@ operation when supported by the target. The pragma can take three values: ``on``, ``fast`` and ``off``. The ``on`` option is identical to using ``#pragma STDC FP_CONTRACT(ON)`` and it allows fusion as specified the language standard. The ``fast`` option allows fusion -in cases when the language standard does not make this possible (e.g. across +in cases when the language standard does not make this possible (e.g., across statements in C). .. code-block:: c++ @@ -6016,14 +6016,14 @@ and ``ignore_denormal_mode``, each optionally prefixed with ``no_``. The meaning are as follows: - ``remote_memory`` means atomic operations may be performed on remote - memory, i.e. memory accessed through off-chip interconnects (e.g., PCIe). + memory, i.e., memory accessed through off-chip interconnects (e.g., PCIe). 
On ROCm platforms using HIP, remote memory refers to memory accessed via PCIe and is subject to specific atomic operation support. See `ROCm PCIe Atomics `_ for further details. Prefixing with ``no_remote_memory`` indicates that atomic operations should not be performed on remote memory. - ``fine_grained_memory`` means atomic operations may be performed on fine-grained - memory, i.e. memory regions that support fine-grained coherence, where updates to + memory, i.e., memory regions that support fine-grained coherence, where updates to memory are visible to other parts of the system even while modifications are ongoing. For example, in HIP, fine-grained coherence ensures that host and device share up-to-date data without explicit synchronization (see @@ -6245,7 +6245,7 @@ match rules are specified after the attribute. The compiler expects an identifier that corresponds to the subject set specifier. The ``apply_to`` specifier is currently the only supported subject set specifier. It allows you to specify match rules that form a subset of the attribute's allowed subject -set, i.e. the compiler doesn't require all of the attribute's subjects. For +set, i.e., the compiler doesn't require all of the attribute's subjects. For example, an attribute like ``[[nodiscard]]`` whose subject set includes ``enum``, ``record`` and ``hasType(functionType)``, requires the presence of at least one of these rules after ``apply_to``: diff --git a/clang/docs/OpenMPSupport.rst b/clang/docs/OpenMPSupport.rst index c75c1703a8dc3..61b5babbd18a8 100644 --- a/clang/docs/OpenMPSupport.rst +++ b/clang/docs/OpenMPSupport.rst @@ -256,7 +256,7 @@ implementation. 
+------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | device-specific environment variables | :none:`unclaimed` | | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ -| device | omp_target_is_accessible routine | :part:`In Progress` | https://github.com/llvm/llvm-project/pull/138294 | +| device | omp_target_is_accessible routine | :good:`done` | https://github.com/llvm/llvm-project/pull/138294 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ | device | omp_get_mapped_ptr routine | :good:`done` | D141545 | +------------------------------+--------------------------------------------------------------+--------------------------+-----------------------------------------------------------------------+ diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index caa2802994ed0..190da055c3afc 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -695,6 +695,7 @@ Bug Fixes to Attribute Support - Using ``[[gnu::cleanup(some_func)]]`` where some_func is annotated with ``[[gnu::error("some error")]]`` now correctly triggers an error. (#GH146520) - Fix a crash when the function name is empty in the `swift_name` attribute. (#GH157075) +- Fixes crashes or missing diagnostics with the `device_kernel` attribute. 
(#GH161905) Bug Fixes to C++ Support ^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/clang/docs/UsersManual.rst b/clang/docs/UsersManual.rst index e82b16f24c73f..fb22ad3c90af4 100644 --- a/clang/docs/UsersManual.rst +++ b/clang/docs/UsersManual.rst @@ -11,7 +11,7 @@ Introduction ============ The Clang Compiler is an open-source compiler for the C family of -programming languages, aiming to be the best in class implementation of +programming languages, aiming to be the best-in-class implementation of these languages. Clang builds on the LLVM optimizer and code generator, allowing it to provide high-quality optimization and code generation support for many targets. For more general information, please see the @@ -56,7 +56,7 @@ migration from GCC to Clang. In most cases, code "just works". Clang also provides an alternative driver, :ref:`clang-cl`, that is designed to be compatible with the Visual C++ compiler, cl.exe. -In addition to language specific features, Clang has a variety of +In addition to language-specific features, Clang has a variety of features that depend on what CPU architecture or operating system is being compiled for. Please see the :ref:`Target-Specific Features and Limitations ` section for more details. @@ -299,7 +299,7 @@ output format of the diagnostics that it generates. This option controls the output format of the filename, line number, and column printed in diagnostic messages. The options, and their - affect on formatting a simple conversion diagnostic, follow: + effect on formatting a simple conversion diagnostic, follow: **clang** (default) :: @@ -360,7 +360,7 @@ output format of the diagnostics that it generates. t.c:3:11: warning: conversion specifies type 'char *' but the argument has type 'int' [-Wformat,Format String] This category can be used by clients that want to group diagnostics - by category, so it should be a high level category. We want dozens + by category, so it should be a high-level category. 
We want dozens of these, not hundreds or thousands of them. .. _opt_fsave-optimization-record: @@ -750,7 +750,7 @@ control the crash diagnostics. Disable auto-generation of preprocessed source files during a clang crash. - The -fno-crash-diagnostics flag can be helpful for speeding the process + The ``-fno-crash-diagnostics`` flag can be helpful for speeding the process of generating a delta reduced test case. .. option:: -fcrash-diagnostics-dir= @@ -779,7 +779,7 @@ Options to Emit Optimization Reports ------------------------------------ Optimization reports trace, at a high-level, all the major decisions -done by compiler transformations. For instance, when the inliner +made by compiler transformations. For instance, when the inliner decides to inline function ``foo()`` into ``bar()``, or the loop unroller decides to unroll a loop N times, or the vectorizer decides to vectorize a loop body. @@ -845,11 +845,11 @@ compilations steps. .. option:: -fproc-stat-report= - This option requests driver to print used memory and execution time of each + This option requests the driver to print used memory and execution time of each compilation step. The ``clang`` driver during execution calls different tools, like compiler, assembler, linker etc. With this option the driver reports total execution time, the execution time spent in user mode and peak memory - usage of each the called tool. Value of the option specifies where the report + usage of each called tool. Value of the option specifies where the report is sent to. If it specifies a regular file, the data are saved to this file in CSV format: @@ -869,7 +869,7 @@ compilations steps. * peak memory usage in Kb. It is possible to specify this option without any value. In this case statistics - are printed on standard output in human readable format: + are printed on standard output in human-readable format: .. code-block:: console @@ -884,7 +884,7 @@ compilations steps. 
You can also use environment variables to control the process statistics reporting. Setting ``CC_PRINT_PROC_STAT`` to ``1`` enables the feature, the report goes to - stdout in human readable format. + stdout in human-readable format. Setting ``CC_PRINT_PROC_STAT_FILE`` to a fully qualified file path makes it report process statistics to the given file in the CSV format. Specifying a relative path will likely lead to multiple files with the same name created in different @@ -922,7 +922,7 @@ Clang options that don't fit neatly into other categories. most filenames can be written to the file without any special formatting. Different Make tools will treat different sets of characters as "special" and use different conventions for telling the Make tool that the character - is actually part of the filename. Normally Clang uses backslash to "escape" + is actually part of the filename. Normally, Clang uses backslash to "escape" a special character, which is the convention used by GNU Make. The -MV option tells Clang to put double-quotes around the entire filename, which is the convention used by NMake and Jom. @@ -957,7 +957,7 @@ Configuration files Configuration files group command-line options and allow all of them to be specified just by referencing the configuration file. They may be used, for -example, to collect options required to tune compilation for particular +example, to collect options required to tune compilation for a particular target, such as ``-L``, ``-I``, ``-l``, ``--sysroot``, codegen options, etc. Configuration files can be either specified on the command line or loaded @@ -986,7 +986,7 @@ either during build or during runtime. At build time, use ``CLANG_CONFIG_FILE_USER_DIR`` and ``CLANG_CONFIG_FILE_SYSTEM_DIR``. At run time use the ``--config-user-dir=`` and ``--config-system-dir=`` command line options. Specifying config directories at runtime overrides the config -directories set at build time The first file found is used. 
It is an error if +directories set at build time. The first file found is used. It is an error if the required file cannot be found. The default configuration files are searched for in the same directories @@ -996,7 +996,7 @@ the ``--no-default-config`` flag. First, the algorithm searches for a configuration file named ``-.cfg`` where `triple` is the triple for the target being -built for, and `driver` is the name of the currently used driver. The algorithm +built, and `driver` is the name of the currently used driver. The algorithm first attempts to use the canonical name for the driver used, then falls back to the one found in the executable name. @@ -1047,7 +1047,7 @@ It is not an error if either of these files is not found. The configuration file consists of command-line options specified on one or more lines. Lines composed of whitespace characters only are ignored as well as lines in which the first non-blank character is ``#``. Long options may be split -between several lines by a trailing backslash. Here is example of a +between several lines by a trailing backslash. Here is an example of a configuration file: :: @@ -1229,7 +1229,7 @@ Clang also allows you to push and pop the current warning state. This is particularly useful when writing a header file that will be compiled by other people, because you don't know what warning flags they build with. -In the below example :option:`-Wextra-tokens` is ignored for only a single line +In the example below, :option:`-Wextra-tokens` is ignored for only a single line of code, after which the diagnostics return to whatever state had previously existed. @@ -1253,7 +1253,7 @@ of warnings, so even when using GCC-compatible #pragmas there is no guarantee that they will have identical behaviour on both compilers. Clang also doesn't yet support GCC behavior for ``#pragma diagnostic pop`` -that doesn't have a corresponding ``#pragma diagnostic push``. 
In this case +that doesn't have a corresponding ``#pragma diagnostic push``. In this case, GCC pretends that there is a ``#pragma diagnostic push`` at the very beginning of the source file, so "unpaired" ``#pragma diagnostic pop`` matches that implicit push. This makes a difference for ``#pragma GCC diagnostic ignored`` @@ -1406,7 +1406,7 @@ project even if there are violations in some headers. # directory. But it'll still complain for all the other sources, e.g: $ cat foo/bar.cc #include "dir/include.h" // Clang flags unused declarations here. - #include "foo/include.h" // but unused warnings under this source is omitted. + #include "foo/include.h" // but unused warnings under this source are omitted. #include "next_to_bar_cc.h" // as are unused warnings from this header file. // Further, unused warnings in the remainder of bar.cc are also omitted. @@ -1648,7 +1648,7 @@ for more details. .. option:: -fno-fast-math - Disable fast-math mode. This options disables unsafe floating-point + Disable fast-math mode. This option disables unsafe floating-point optimizations by preventing the compiler from making any transformations that could affect the results. @@ -1766,7 +1766,7 @@ for more details. * ``fast``: enable fusion across statements disregarding pragmas, breaking compliance with the C and C++ standards (default for CUDA). - * ``on``: enable C and C++ standard complaint fusion in the same statement + * ``on``: enable C and C++ standard compliant fusion in the same statement unless dictated by pragmas (default for languages other than CUDA/HIP) * ``off``: disable fusion * ``fast-honor-pragmas``: fuse across statements unless dictated by pragmas @@ -1919,7 +1919,7 @@ for more details. a single expression of the code. Valid values are: ``source``, ``double``, and ``extended``. - For 64-bit targets, the default value is ``source``. For 32-bit x86 targets + For 64-bit targets, the default value is ``source``. 
For 32-bit x86 targets, however, in the case of NETBSD 6.99.26 and under, the default value is ``double``; in the case of NETBSD greater than 6.99.26, with NoSSE, the default value is ``extended``, with SSE the default value is ``source``. @@ -3881,9 +3881,9 @@ See :doc:`LanguageExtensions`. Differences between various standard modes ------------------------------------------ -clang supports the -std option, which changes what language mode clang uses. +clang supports the ``-std`` option, which changes what language mode clang uses. The supported modes for C are c89, gnu89, c94, c99, gnu99, c11, gnu11, c17, -gnu17, c23, gnu23, c2y, gnu2y, and various aliases for those modes. If no -std +gnu17, c23, gnu23, c2y, gnu2y, and various aliases for those modes. If no ``-std`` option is specified, clang defaults to gnu17 mode. Many C99 and C11 features are supported in earlier modes as a conforming extension, with a warning. Use ``-pedantic-errors`` to request an error if a feature from a later standard @@ -4609,7 +4609,7 @@ codebases. On ``x86_64-mingw32``, passing i128(by value) is incompatible with the Microsoft x64 calling convention. You might need to tweak -``WinX86_64ABIInfo::classify()`` in lib/CodeGen/Targets/X86.cpp. +``WinX86_64ABIInfo::classify()`` in ``lib/CodeGen/Targets/X86.cpp``. For the X86 target, clang supports the `-m16` command line argument which enables 16-bit code output. This is broadly similar to @@ -4760,8 +4760,8 @@ is imported, the linker will generate fixup code for reading or writing to the variable. When multiple toc-data options are used, the last option used has the affect. -For example: -mno-tocdata=g5,g1 -mtocdata=g1,g2 -mno-tocdata=g2 -mtocdata=g3,g4 -results in -mtocdata=g1,g3,g4 +For example: ``-mno-tocdata=g5,g1 -mtocdata=g1,g2 -mno-tocdata=g2 -mtocdata=g3,g4`` +results in ``-mtocdata=g1,g3,g4`` Names of variables not having external linkage will be ignored. 
@@ -5143,16 +5143,16 @@ Execute ``clang-cl /?`` to see a list of supported options: Instrument only functions from files where names match any regex separated by a semi-colon -fprofile-generate= Generate instrumented code to collect execution counts into a raw profile file in the directory specified by the argument. The filename uses default_%m.profraw pattern - (overridden by LLVM_PROFILE_FILE env var) + (overridden by ``LLVM_PROFILE_FILE`` env var) -fprofile-generate Generate instrumented code to collect execution counts into default_%m.profraw file - (overridden by '=' form of option or LLVM_PROFILE_FILE env var) + (overridden by '=' form of option or ``LLVM_PROFILE_FILE`` env var) -fprofile-instr-generate= Generate instrumented code to collect execution counts into the file whose name pattern is specified as the argument - (overridden by LLVM_PROFILE_FILE env var) + (overridden by ``LLVM_PROFILE_FILE`` env var) -fprofile-instr-generate Generate instrumented code to collect execution counts into default.profraw file - (overridden by '=' form of option or LLVM_PROFILE_FILE env var) + (overridden by '=' form of option or ``LLVM_PROFILE_FILE`` env var) -fprofile-instr-use= Use instrumentation data for coverage testing or profile-guided optimization -fprofile-use= diff --git a/clang/include/clang/AST/Attr.h b/clang/include/clang/AST/Attr.h index fe388b9fa045e..ce273c167aa22 100644 --- a/clang/include/clang/AST/Attr.h +++ b/clang/include/clang/AST/Attr.h @@ -239,6 +239,8 @@ class HLSLSemanticAttr : public HLSLAnnotationAttr { LLVM_PREFERRED_TYPE(bool) unsigned SemanticExplicitIndex : 1; + Decl *TargetDecl = nullptr; + protected: HLSLSemanticAttr(ASTContext &Context, const AttributeCommonInfo &CommonInfo, attr::Kind AK, bool IsLateParsed, @@ -259,6 +261,11 @@ class HLSLSemanticAttr : public HLSLAnnotationAttr { unsigned getSemanticIndex() const { return SemanticIndex; } + bool isSemanticIndexExplicit() const { return SemanticExplicitIndex; } + + void 
setTargetDecl(Decl *D) { TargetDecl = D; } + Decl *getTargetDecl() const { return TargetDecl; } + // Implement isa/cast/dyncast/etc. static bool classof(const Attr *A) { return A->getKind() >= attr::FirstHLSLSemanticAttr && diff --git a/clang/include/clang/AST/InferAlloc.h b/clang/include/clang/AST/InferAlloc.h new file mode 100644 index 0000000000000..c3dc30204feaf --- /dev/null +++ b/clang/include/clang/AST/InferAlloc.h @@ -0,0 +1,35 @@ +//===--- InferAlloc.h - Allocation type inference ---------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines interfaces for allocation-related type inference. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CLANG_AST_INFERALLOC_H +#define LLVM_CLANG_AST_INFERALLOC_H + +#include "clang/AST/ASTContext.h" +#include "clang/AST/Expr.h" +#include "llvm/Support/AllocToken.h" +#include + +namespace clang { +namespace infer_alloc { + +/// Infer the possible allocated type from an allocation call expression. +QualType inferPossibleType(const CallExpr *E, const ASTContext &Ctx, + const CastExpr *CastE); + +/// Get the information required for construction of an allocation token ID. 
+std::optional +getAllocTokenMetadata(QualType T, const ASTContext &Ctx); + +} // namespace infer_alloc +} // namespace clang + +#endif // LLVM_CLANG_AST_INFERALLOC_H diff --git a/clang/include/clang/AST/StmtOpenACC.h b/clang/include/clang/AST/StmtOpenACC.h index 8b4554e996326..ae8029797a36e 100644 --- a/clang/include/clang/AST/StmtOpenACC.h +++ b/clang/include/clang/AST/StmtOpenACC.h @@ -815,6 +815,20 @@ class OpenACCAtomicConstruct final Stmt *getAssociatedStmt() { return OpenACCAssociatedStmtConstruct::getAssociatedStmt(); } + + // A struct to represent a broken-down version of the associated statement, + // providing the information specified in OpenACC3.3 Section 2.12. + struct StmtInfo { + const Expr *V; + const Expr *X; + // Listed as 'expr' in the standard, this is typically a generic expression + // as a component. + const Expr *RefExpr; + // TODO: OpenACC: We should expand this as we're implementing the other + // atomic construct kinds. + }; + + const StmtInfo getAssociatedStmtInfo() const; }; } // namespace clang diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td index eb48a0c01fd1e..749f531ec9ab1 100644 --- a/clang/include/clang/Basic/Attr.td +++ b/clang/include/clang/Basic/Attr.td @@ -787,6 +787,8 @@ class HLSLSemanticAttr : HLSLAnnotationAttr { let Spellings = []; let Subjects = SubjectList<[ParmVar, Field, Function]>; let LangOpts = [HLSL]; + let Args = [DeclArgument, IntArgument<"SemanticIndex">, + BoolArgument<"SemanticExplicitIndex">]; } /// A target-specific attribute. 
This class is meant to be used as a mixin @@ -1623,7 +1625,7 @@ def SYCLKernel : InheritableAttr { let Documentation = [SYCLKernelDocs]; } -def DeviceKernel : DeclOrTypeAttr { +def DeviceKernel : InheritableAttr { let Spellings = [Clang<"device_kernel">, Clang<"nvptx_kernel">, Clang<"amdgpu_kernel">, CustomKeyword<"__kernel">, CustomKeyword<"kernel">]; diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index a350acdf146ab..a2c202158522f 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4727,6 +4727,13 @@ def PtrauthStringDiscriminator : Builtin { let Prototype = "size_t(char const*)"; } +// AllocToken builtins. +def InferAllocToken : Builtin { + let Spellings = ["__builtin_infer_alloc_token"]; + let Attributes = [NoThrow, Const, Pure, CustomTypeChecking, Constexpr, UnevaluatedArguments]; + let Prototype = "size_t(...)"; +} + // OpenCL v2.0 s6.13.16, s9.17.3.5 - Pipe functions. // We need the generic prototype, since the packet type could be anything. 
def ReadPipe : OCLPipeLangBuiltin { diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def index f98266f9a86fe..677c2f671bcc9 100644 --- a/clang/include/clang/Basic/BuiltinsAMDGPU.def +++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def @@ -838,6 +838,15 @@ TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b4_u4, "V2UiUiUiV2Ui", "nc", "tensor-c TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b6_u4, "V3UiUiULiV2Ui", "nc", "tensor-cvt-lut-insts") TARGET_BUILTIN(__builtin_amdgcn_perm_pk16_b8_u4, "V4UiULiULiV2Ui", "nc", "tensor-cvt-lut-insts") +TARGET_BUILTIN(__builtin_amdgcn_add_max_i32, "iiiiIb", "nc", "add-min-max-insts") +TARGET_BUILTIN(__builtin_amdgcn_add_max_u32, "UiUiUiUiIb", "nc", "add-min-max-insts") +TARGET_BUILTIN(__builtin_amdgcn_add_min_i32, "iiiiIb", "nc", "add-min-max-insts") +TARGET_BUILTIN(__builtin_amdgcn_add_min_u32, "UiUiUiUiIb", "nc", "add-min-max-insts") +TARGET_BUILTIN(__builtin_amdgcn_pk_add_max_i16, "V2sV2sV2sV2sIb", "nc", "pk-add-min-max-insts") +TARGET_BUILTIN(__builtin_amdgcn_pk_add_max_u16, "V2UsV2UsV2UsV2UsIb", "nc", "pk-add-min-max-insts") +TARGET_BUILTIN(__builtin_amdgcn_pk_add_min_i16, "V2sV2sV2sV2sIb", "nc", "pk-add-min-max-insts") +TARGET_BUILTIN(__builtin_amdgcn_pk_add_min_u16, "V2UsV2UsV2UsV2UsIb", "nc", "pk-add-min-max-insts") + // GFX1250 WMMA builtins TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32") diff --git a/clang/include/clang/Basic/BuiltinsHexagon.td b/clang/include/clang/Basic/BuiltinsHexagon.td index 0727c67346697..cf18359e7bf60 100644 --- a/clang/include/clang/Basic/BuiltinsHexagon.td +++ b/clang/include/clang/Basic/BuiltinsHexagon.td @@ -20,8 +20,9 @@ class V : VFeatures { let Features = !strconcat("v", version, "|", newer.Features); } -let Features = "v79" in def V79 : VFeatures; +let Features = 
"v81" in def V81 : VFeatures; +def V79 : V<"79", V81>; def V75 : V<"75", V79>; def V73 : V<"73", V75>; def V71 : V<"71", V73>; @@ -43,8 +44,9 @@ class HVXV : HVXVFeatures { let Features = !strconcat("hvxv", version, "|", newer.Features); } -let Features = "hvxv79" in def HVXV79 : HVXVFeatures; +let Features = "hvxv81" in def HVXV81 : HVXVFeatures; +def HVXV79 : HVXV<"79", HVXV81>; def HVXV75 : HVXV<"75", HVXV79>; def HVXV73 : HVXV<"73", HVXV75>; def HVXV71 : HVXV<"71", HVXV73>; diff --git a/clang/include/clang/Basic/BuiltinsX86.td b/clang/include/clang/Basic/BuiltinsX86.td index 54b3ce08f5625..0c85e280e748b 100644 --- a/clang/include/clang/Basic/BuiltinsX86.td +++ b/clang/include/clang/Basic/BuiltinsX86.td @@ -123,13 +123,13 @@ let Attributes = [Const, NoThrow, RequiredVectorWidth<128>] in { } } - let Features = "ssse3" in { - def psignb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; - def psignw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; - def psignd128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; - } - let Features = "ssse3", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def psignb128 + : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; + def psignw128 + : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; + def psignd128 + : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; def pmulhrsw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def pmaddubsw128 : X86Builtin<"_Vector<8, short>(_Vector<16, char>, _Vector<16, char>)">; def pshufb128 : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Vector<16, char>)">; @@ -198,6 +198,10 @@ let Features = "sse", Header = "xmmintrin.h", Attributes = [NoThrow, RequireDecl def _mm_sfence : X86LibBuiltin<"void()">; } +let Features = "sse", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<128>] in { + def shufps : 
X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; +} + let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def rcpps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">; def rcpss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">; @@ -205,7 +209,6 @@ let Features = "sse", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in def rsqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">; def sqrtps : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">; def sqrtss : X86Builtin<"_Vector<4, float>(_Vector<4, float>)">; - def shufps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, float>, _Constant int)">; } let Features = "sse2", Attributes = [NoThrow, RequiredVectorWidth<128>] in { @@ -222,13 +225,13 @@ let Features = "sse2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def pshufhw : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Constant int)">; def movmskpd : X86Builtin<"int(_Vector<2, double>)">; def pmovmskb128 : X86Builtin<"int(_Vector<16, char>)">; + def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; } let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] in { def psadbw128 : X86Builtin<"_Vector<2, long long int>(_Vector<16, char>, _Vector<16, char>)">; def sqrtpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">; def sqrtsd : X86Builtin<"_Vector<2, double>(_Vector<2, double>)">; - def shufpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, double>, _Constant int)">; def cvtpd2dq : X86Builtin<"_Vector<2, long long int>(_Vector<2, double>)">; def cvtpd2ps : X86Builtin<"_Vector<4, float>(_Vector<2, double>)">; def cvttpd2dq : X86Builtin<"_Vector<4, int>(_Vector<2, double>)">; @@ -280,8 +283,6 @@ let Features = "sse2", Attributes = [NoThrow, Const, RequiredVectorWidth<128>] i def psllw128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, _Vector<8, short>)">; def 
pslld128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, _Vector<4, int>)">; def psllq128 : X86Builtin<"_Vector<2, long long int>(_Vector<2, long long int>, _Vector<2, long long int>)">; - def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; - def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; } let Features = "sse2", @@ -300,6 +301,9 @@ let Features = "sse2", def psrawi128 : X86Builtin<"_Vector<8, short>(_Vector<8, short>, int)">; def psradi128 : X86Builtin<"_Vector<4, int>(_Vector<4, int>, int)">; + + def pslldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; + def psrldqi128_byteshift : X86Builtin<"_Vector<16, char>(_Vector<16, char>, _Constant int)">; } let Features = "sse3", Attributes = [NoThrow] in { @@ -487,13 +491,16 @@ let Features = "avx512f,vpclmulqdq", Attributes = [NoThrow, Const, RequiredVecto def pclmulqdq512 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant char)">; } +let Features = "avx", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<256>] in { + def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; + def shufps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; +} + let Features = "avx", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] in { def vpermilvarpd : X86Builtin<"_Vector<2, double>(_Vector<2, double>, _Vector<2, long long int>)">; def vpermilvarps : X86Builtin<"_Vector<4, float>(_Vector<4, float>, _Vector<4, int>)">; def vpermilvarpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, long long int>)">; def vpermilvarps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, int>)">; - def shufpd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant int)">; - def shufps256 : X86Builtin<"_Vector<8, 
float>(_Vector<8, float>, _Vector<8, float>, _Constant int)">; def dpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">; def cmppd256 : X86Builtin<"_Vector<4, double>(_Vector<4, double>, _Vector<4, double>, _Constant char)">; def cmpps256 : X86Builtin<"_Vector<8, float>(_Vector<8, float>, _Vector<8, float>, _Constant char)">; @@ -603,17 +610,14 @@ let Features = "avx2", Attributes = [NoThrow, Const, RequiredVectorWidth<256>] i def mpsadbw256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>, _Constant char)">; def palignr256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, " "_Vector<32, char>, _Constant int)">; - def psadbw256 : X86Builtin<"_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; - def psignb256 : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; - def psignw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - def psignd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; + def psadbw256 + : X86Builtin< + "_Vector<4, long long int>(_Vector<32, char>, _Vector<32, char>)">; def psllw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; - def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def pslld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; def psllq256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; def psraw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrad256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; - def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def psrlw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<8, short>)">; def psrld256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<4, int>)">; def psrlq256 : X86Builtin<"_Vector<4, long 
long int>(_Vector<4, long long int>, _Vector<2, long long int>)">; @@ -647,10 +651,12 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def psllwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">; def pslldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">; def psllqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">; + def pslldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def psrlwi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">; def psrldi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">; def psrlqi256 : X86Builtin<"_Vector<4, long long int>(_Vector<4, long long int>, int)">; + def psrldqi256_byteshift : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Constant int)">; def psrawi256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, int)">; def psradi256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, int)">; @@ -677,7 +683,15 @@ let Features = "avx2", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWi def phsubw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; def phsubd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; def phsubsw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; - + + def psignb256 + : X86Builtin<"_Vector<32, char>(_Vector<32, char>, _Vector<32, char>)">; + def psignw256 + : X86Builtin< + "_Vector<16, short>(_Vector<16, short>, _Vector<16, short>)">; + def psignd256 + : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Vector<8, int>)">; + def pshuflw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; def pshufhw256 : X86Builtin<"_Vector<16, short>(_Vector<16, short>, _Constant int)">; def pshufd256 : X86Builtin<"_Vector<8, int>(_Vector<8, int>, _Constant int)">; @@ -2083,6 +2097,9 @@ let Features = "avx512bw", Attributes = [NoThrow, Const, RequiredVectorWidth<512 : X86Builtin<"_Vector<32, 
short>(_Vector<32, short>, _Vector<8, short>)">; def psrlw512 : X86Builtin<"_Vector<32, short>(_Vector<32, short>, _Vector<8, short>)">; +} + +let Features = "avx512bw", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def pslldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; def psrldqi512_byteshift : X86Builtin<"_Vector<64, char>(_Vector<64, char>, _Constant int)">; } @@ -2463,6 +2480,9 @@ let Features = "avx512f", Attributes = [NoThrow, Const, RequiredVectorWidth<512> def shuf_f64x2 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">; def shuf_i32x4 : X86Builtin<"_Vector<16, int>(_Vector<16, int>, _Vector<16, int>, _Constant int)">; def shuf_i64x2 : X86Builtin<"_Vector<8, long long int>(_Vector<8, long long int>, _Vector<8, long long int>, _Constant int)">; +} + +let Features = "avx512f", Attributes = [NoThrow, Const, Constexpr, RequiredVectorWidth<512>] in { def shufpd512 : X86Builtin<"_Vector<8, double>(_Vector<8, double>, _Vector<8, double>, _Constant int)">; def shufps512 : X86Builtin<"_Vector<16, float>(_Vector<16, float>, _Vector<16, float>, _Constant int)">; } diff --git a/clang/include/clang/Basic/CodeGenOptions.h b/clang/include/clang/Basic/CodeGenOptions.h index ecf9bd27ca438..65d47c5e8b31d 100644 --- a/clang/include/clang/Basic/CodeGenOptions.h +++ b/clang/include/clang/Basic/CodeGenOptions.h @@ -455,10 +455,6 @@ class CodeGenOptions : public CodeGenOptionsBase { std::optional AllowRuntimeCheckSkipHotCutoff; - /// Maximum number of allocation tokens (0 = no max), nullopt if none set (use - /// pass default). - std::optional AllocTokenMax; - /// List of backend command-line options for -fembed-bitcode. 
std::vector CmdArgs; diff --git a/clang/include/clang/Basic/DiagnosticASTKinds.td b/clang/include/clang/Basic/DiagnosticASTKinds.td index 0be9146f70364..5c462f9646b3b 100644 --- a/clang/include/clang/Basic/DiagnosticASTKinds.td +++ b/clang/include/clang/Basic/DiagnosticASTKinds.td @@ -403,6 +403,12 @@ def note_constexpr_assumption_failed : Note< def note_constexpr_countzeroes_zero : Note< "evaluation of %select{__builtin_elementwise_clzg|__builtin_elementwise_ctzg}0 " "with a zero value is undefined">; +def note_constexpr_infer_alloc_token_type_inference_failed : Note< + "could not infer allocation type for __builtin_infer_alloc_token">; +def note_constexpr_infer_alloc_token_no_metadata : Note< + "could not get token metadata for inferred type">; +def note_constexpr_infer_alloc_token_stateful_mode : Note< + "stateful alloc token mode not supported in constexpr">; def err_experimental_clang_interp_failed : Error< "the experimental clang interpreter failed to evaluate an expression">; diff --git a/clang/include/clang/Basic/DiagnosticFrontendKinds.td b/clang/include/clang/Basic/DiagnosticFrontendKinds.td index 64391def905ab..9e344160ff934 100644 --- a/clang/include/clang/Basic/DiagnosticFrontendKinds.td +++ b/clang/include/clang/Basic/DiagnosticFrontendKinds.td @@ -404,10 +404,6 @@ def warn_hlsl_langstd_minimal : "recommend using %1 instead">, InGroup; -def err_hlsl_semantic_missing : Error<"semantic annotations must be present " - "for all input and outputs of an entry " - "function or patch constant function">; - // ClangIR frontend errors def err_cir_to_cir_transform_failed : Error< "CIR-to-CIR transformation failed">, DefaultFatal; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index dd1fbc68c17e2..4c019c5407952 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2418,9 +2418,9 @@ def err_init_conversion_failed : Error< "cannot 
initialize %select{a variable|a parameter|template parameter|" "return object|statement expression result|an " "exception object|a member subobject|an array element|a new value|a value|a " - "base class|a constructor delegation|a vector element|a block element|a " - "block element|a complex element|a lambda capture|a compound literal " - "initializer|a related result|a parameter of CF audited function|a " + "base class|a constructor delegation|a vector element|a matrix element|a " + "block element|a block element|a complex element|a lambda capture|a compound" + " literal initializer|a related result|a parameter of CF audited function|a " "structured binding|a member subobject}0 " "%diff{of type $ with an %select{rvalue|lvalue}2 of type $|" "with an %select{rvalue|lvalue}2 of incompatible type}1,3" @@ -4122,6 +4122,9 @@ def warn_missing_sdksettings_for_availability_checking : Warning< "%0 availability is ignored without a valid 'SDKSettings.json' in the SDK">, InGroup>; +def err_hidden_device_kernel + : Error<"%0 is specified as a device kernel but it is not externally visible">; + // Thread Safety Attributes def warn_thread_attribute_ignored : Warning< "ignoring %0 attribute because its argument is invalid">, @@ -6542,9 +6545,9 @@ def warn_extern_init : Warning<"'extern' variable has an initializer">, def err_variable_object_no_init : Error< "variable-sized object may not be initialized">; def err_excess_initializers : Error< - "excess elements in %select{array|vector|scalar|union|struct}0 initializer">; + "excess elements in %select{array|vector|matrix|scalar|union|struct}0 initializer">; def ext_excess_initializers : ExtWarn< - "excess elements in %select{array|vector|scalar|union|struct}0 initializer">, + "excess elements in %select{array|vector|matrix|scalar|union|struct}0 initializer">, InGroup; def err_excess_initializers_for_sizeless_type : Error< "excess elements in initializer for indivisible sizeless type %0">; @@ -13145,6 +13148,7 @@ def 
err_riscv_attribute_interrupt_requires_extension : Error< "RISC-V 'interrupt' attribute '%0' requires extension '%1'">; def err_riscv_attribute_interrupt_invalid_combination : Error< "RISC-V 'interrupt' attribute contains invalid combination of interrupt types">; +def err_riscv_builtin_invalid_twiden : Error<"RISC-V XSfmm twiden must be 1, 2 or 4">; def err_std_source_location_impl_not_found : Error< "'std::source_location::__impl' was not found; it must be defined before '__builtin_source_location' is called">; @@ -13170,6 +13174,7 @@ def err_hlsl_duplicate_parameter_modifier : Error<"duplicate parameter modifier def err_hlsl_missing_semantic_annotation : Error< "semantic annotations must be present for all parameters of an entry " "function or patch constant function">; +def note_hlsl_semantic_used_here : Note<"%0 used here">; def err_hlsl_unknown_semantic : Error<"unknown HLSL semantic %0">; def err_hlsl_semantic_output_not_supported : Error<"semantic %0 does not support output">; diff --git a/clang/include/clang/Basic/FileManager.h b/clang/include/clang/Basic/FileManager.h index 337911e99b303..fa7552b6ae41d 100644 --- a/clang/include/clang/Basic/FileManager.h +++ b/clang/include/clang/Basic/FileManager.h @@ -287,8 +287,12 @@ class FileManager : public RefCountedBase { /// If path is not absolute and FileSystemOptions set the working /// directory, the path is modified to be relative to the given /// working directory. - /// \returns true if \c path changed. - bool FixupRelativePath(SmallVectorImpl &path) const; + /// \returns true if \c Path changed. + bool FixupRelativePath(SmallVectorImpl &Path) const { + return fixupRelativePath(FileSystemOpts, Path); + } + static bool fixupRelativePath(const FileSystemOptions &FileSystemOpts, + SmallVectorImpl &Path); /// Makes \c Path absolute taking into account FileSystemOptions and the /// working directory option. 
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h index 260a7537edb9d..8aa89d8c8c807 100644 --- a/clang/include/clang/Basic/LangOptions.h +++ b/clang/include/clang/Basic/LangOptions.h @@ -25,6 +25,7 @@ #include "llvm/ADT/FloatingPointMode.h" #include "llvm/ADT/StringRef.h" #include "llvm/BinaryFormat/DXContainer.h" +#include "llvm/Support/AllocToken.h" #include "llvm/TargetParser/Triple.h" #include #include @@ -565,6 +566,13 @@ class LangOptions : public LangOptionsBase { bool AtomicFineGrainedMemory = false; bool AtomicIgnoreDenormalMode = false; + /// Maximum number of allocation tokens (0 = no max), nullopt if none set (use + /// target default). + std::optional AllocTokenMax; + + /// The allocation token mode. + std::optional AllocTokenMode; + LangOptions(); /// Set language defaults for the given input language and diff --git a/clang/include/clang/Basic/riscv_sifive_vector.td b/clang/include/clang/Basic/riscv_sifive_vector.td index 772fd3ef4201f..89e644a078682 100644 --- a/clang/include/clang/Basic/riscv_sifive_vector.td +++ b/clang/include/clang/Basic/riscv_sifive_vector.td @@ -14,6 +14,10 @@ include "riscv_vector_common.td" +class IsFloat { + bit val = !or(!eq(type, "x"), !eq(type, "f"), !eq(type, "d"), !eq(type, "y")); +} + //===----------------------------------------------------------------------===// // Instruction definitions //===----------------------------------------------------------------------===// @@ -198,3 +202,169 @@ let ManualCodegen = [{ defm sf_vfnrclip_xu_f_qf : RVVVFNRCLIPBuiltinSet<"Uv", "UvFqf", "c">; } } + +multiclass RVVSFTileLoadStoreBuiltinSet types, + list RequiredFeatures = []> { + let OverloadedName = NAME, + Name = NAME, + IRName = NAME, + Log2LMUL = [0], + HasMasked = false, + ManualCodegen = [{IntrinsicTypes = {Ops.back()->getType()};}] in + foreach type = types in { + let RequiredFeatures = !listconcat(RequiredFeatures, + !cond(!eq(type, "x"): ["zvfhmin"], + !eq(type, "y"): 
["zvfbfmin"], + true: [])) in { + def : RVVBuiltin<"e", "0zPCe", type>; + if !not(IsFloat.val) then + def : RVVBuiltin<"Ue", "0zPCUe", type>; + } + } +} + +multiclass RVVSFTileMoveBuiltinSet> suffixes_prototypes, + list intrinsic_types, + string type, + list RequiredFeatures = []> { + foreach sp = suffixes_prototypes in + let RequiredFeatures = !listconcat(RequiredFeatures, + !cond(!eq(type, "x"): ["zvfhmin"], + !eq(type, "y"): ["zvfbfmin"], + true: [])), + SupportOverloading = false, + HasMasked = false, + Name = NAME, + IRName = NAME, + HasVL = true, + Log2LMUL = [3], + IntrinsicTypes = intrinsic_types in + def : RVVBuiltin; +} + +multiclass RVVSFTileMoveVTBuiltinSet RequiredFeatures = []> { + foreach type = ["c", "s", "i", "l"] in + defm NAME : + RVVSFTileMoveBuiltinSet<[["v", "vz"], ["Uv", "Uvz"]], [-1], type, + RequiredFeatures>; + foreach type = ["x", "y", "f", "d"] in + defm NAME : + RVVSFTileMoveBuiltinSet<[["v", "vz"]], [-1], type, RequiredFeatures>; +} + +multiclass RVVSFTileMoveTVBuiltinSet RequiredFeatures = []> { + let SupportOverloading = true, OverloadedName = NAME in { + foreach type = ["c", "s", "i", "l"] in + defm NAME : + RVVSFTileMoveBuiltinSet<[["v", "0zv"], ["Uv", "0zUv"]], [1], type, + RequiredFeatures>; + foreach type = ["x", "y", "f", "d"] in + defm NAME : + RVVSFTileMoveBuiltinSet<[["v", "0zv"]], [1], type, RequiredFeatures>; + } +} + +multiclass RVVOp0Op1Op2BuiltinSet> suffixes_prototypes> + : RVVBuiltinSet; + +multiclass RVVSFMatMulBuiltinSet widens> { + foreach widen = widens in + let OverloadedName = NAME, + TWiden = widen, + HasVL = false, + Log2LMUL = [3], + HasMasked = false in + defm NAME : RVVOp0Op1Op2BuiltinSet(widen)), suffix, prototype]]>; +} + +multiclass RVVSFMatMulFloatBuiltinSet type_range, int widen> { + // Currently the XSfmm spec doesn't support w8. 
+ foreach type = type_range in + let OverloadedName = name # !strconcat("_w", !cast(widen)), + TWiden = widen, + HasVL = false, + Log2LMUL = [3], + Name = name # "_" # !strconcat("w", !cast(widen)), + HasMasked = false in + defm : RVVOp0Op1BuiltinSet; +} + +multiclass RVVSFVTZeroBuiltinSet { + let SupportOverloading = false, + HasVL = false, + HasMasked = false, + Name = NAME, + IRName = NAME, + Log2LMUL = [0] in + defm : RVVOp0BuiltinSet; +} + +multiclass RVVSFVTDiscardBuiltinSet { + let SupportOverloading = false, + HasVL = false, + HasMasked = false, + Name = NAME, + IRName = NAME, + Log2LMUL = [0] in + defm : RVVBuiltinSet; +} + +let RequiredFeatures = ["xsfmmbase"] in { + let SupportOverloading = false, + HasVL = false, + HasMasked = false, + Log2LMUL = [0], + ManualCodegen = [{IntrinsicTypes = {ResultType};}] in // Set XLEN type + { + // let HasBuiltinAlias = false in + def sf_vsettnt : RVVBuiltin<"", "zzKzKz", "i">; + def sf_vsettm : RVVBuiltin<"", "zzKzKz", "i">; + let IRName = "sf_vsettnt" in + def sf_vsettn : RVVBuiltin<"", "zzKzKz", "i">; + def sf_vsettk : RVVBuiltin<"", "zzKzKz", "i">; + } + defm sf_vtzero_t : RVVSFVTZeroBuiltinSet; + defm sf_vtdiscard : RVVSFVTDiscardBuiltinSet; +} + +defm sf_vtmv_v_t : RVVSFTileMoveVTBuiltinSet<["xsfmmbase"]>; +defm sf_vtmv_t_v : RVVSFTileMoveTVBuiltinSet<["xsfmmbase"]>; + +defm sf_vlte8 : RVVSFTileLoadStoreBuiltinSet<["c"], ["xsfmmbase"]>; +defm sf_vlte16 : RVVSFTileLoadStoreBuiltinSet<["s", "x", "y"], ["xsfmmbase"]>; +defm sf_vlte32 : RVVSFTileLoadStoreBuiltinSet<["i", "f"], ["xsfmmbase"]>; +defm sf_vlte64 : RVVSFTileLoadStoreBuiltinSet<["l", "d"], ["xsfmmbase"]>; + +defm sf_vste8 : RVVSFTileLoadStoreBuiltinSet<["c"], ["xsfmmbase"]>; +defm sf_vste16 : RVVSFTileLoadStoreBuiltinSet<["s", "x", "y"], ["xsfmmbase"]>; +defm sf_vste32 : RVVSFTileLoadStoreBuiltinSet<["i", "f"], ["xsfmmbase"]>; +defm sf_vste64 : RVVSFTileLoadStoreBuiltinSet<["l", "d"], ["xsfmmbase"]>; + +let RequiredFeatures = ["xsfmm32a8i"] in { + defm 
sf_mm_u_u : RVVSFMatMulBuiltinSet<"0KzUvUvzzz", "UvUv", "c", [4]>; + defm sf_mm_s_u : RVVSFMatMulBuiltinSet<"0KzvUvzzz", "vUv", "c", [4]>; + defm sf_mm_u_s : RVVSFMatMulBuiltinSet<"0KzUvvzzz", "Uvv", "c", [4]>; + defm sf_mm_s_s : RVVSFMatMulBuiltinSet<"0Kzvvzzz", "vv", "c", [4]>; + +} + +let RequiredFeatures = ["xsfmm32a16f"] in + defm : RVVSFMatMulFloatBuiltinSet<"sf_mm_f_f", "0Kzvvzzz", "v", ["x", "y"], 2>; + +let RequiredFeatures = ["xsfmm32a32f"] in + defm : RVVSFMatMulFloatBuiltinSet<"sf_mm_f_f", "0Kzvvzzz", "v", ["f"], 1>; + +let RequiredFeatures = ["xsfmm32a8f"] in + foreach e1 = [5, 4] in + foreach e2 = [5, 4] in + let OverloadedName = "sf_mm_e" # e1 # "m" # !sub(7, e1) # "_e" # e2 # "m" # !sub(7, e2) in + defm : RVVSFMatMulFloatBuiltinSet< + "sf_mm_e" # e1 # "m" # !sub(7, e1) # "_e" # e2 # "m" # !sub(7, e2), + "0KzUvUvzzz", "UvUv", ["c"], 4>; + +let RequiredFeatures = ["xsfmm64a64f"] in + defm : RVVSFMatMulFloatBuiltinSet<"sf_mm_f_f", "0Kzvvzzz", "v", ["d"], 1>; diff --git a/clang/include/clang/Basic/riscv_vector_common.td b/clang/include/clang/Basic/riscv_vector_common.td index eaa2ba43885b0..7e2d339291713 100644 --- a/clang/include/clang/Basic/riscv_vector_common.td +++ b/clang/include/clang/Basic/riscv_vector_common.td @@ -247,6 +247,9 @@ class RVVBuiltin { + let summary = "Prefetch operation"; + let description = [{ + The `cir.prefetch` operation is a hint to the code generator to insert a + prefetch instruction if supported; otherwise, it is a noop. Prefetches + have no effect on the behavior of the program but can change its + performance characteristics. + + ```mlir + cir.prefetch(%0 : !cir.ptr) locality(1) write + ``` + + $locality is a temporal locality specifier ranging from (0) - no locality, + to (3) - extremely local, keep in cache. If $locality is not present, the + default value is 3. + + $isWrite specifies whether the prefetch is for a 'read' or 'write'. If + $isWrite is not specified, it means that prefetch is prepared for 'read'. 
+ }]; + + let arguments = (ins CIR_VoidPtrType:$addr, + DefaultValuedAttr, IntMaxValue<3>]>, + "3">:$locality, + UnitAttr:$isWrite); + + let assemblyFormat = [{ + (`write` $isWrite^) : (`read`)? + `locality` `(` $locality `)` + $addr `:` qualified(type($addr)) + attr-dict + }]; +} + //===----------------------------------------------------------------------===// // PtrDiffOp //===----------------------------------------------------------------------===// diff --git a/clang/include/clang/CIR/MissingFeatures.h b/clang/include/clang/CIR/MissingFeatures.h index 2374c8d1850ac..48ef8be9fb782 100644 --- a/clang/include/clang/CIR/MissingFeatures.h +++ b/clang/include/clang/CIR/MissingFeatures.h @@ -150,11 +150,9 @@ struct MissingFeatures { static bool zeroSizeRecordMembers() { return false; } // Coroutines - static bool coroAllocBuiltinCall() { return false; } - static bool coroBeginBuiltinCall() { return false; } static bool coroEndBuiltinCall() { return false; } - static bool coroSizeBuiltinCall() { return false; } static bool coroutineFrame() { return false; } + static bool emitBodyAndFallthrough() { return false; } // Various handling of deferred processing in CIRGenModule. 
static bool cgmRelease() { return false; } @@ -219,6 +217,9 @@ struct MissingFeatures { static bool checkBitfieldClipping() { return false; } static bool cirgenABIInfo() { return false; } static bool cleanupAfterErrorDiags() { return false; } + static bool cleanupAppendInsts() { return false; } + static bool cleanupBranchThrough() { return false; } + static bool cleanupIndexAndBIAdjustment() { return false; } static bool cleanupsToDeactivate() { return false; } static bool constEmitterAggILE() { return false; } static bool constEmitterArrayILE() { return false; } @@ -228,6 +229,7 @@ struct MissingFeatures { static bool coverageMapping() { return false; } static bool createInvariantGroup() { return false; } static bool createProfileWeightsForLoop() { return false; } + static bool ctorConstLvalueToRvalueConversion() { return false; } static bool ctorMemcpyizer() { return false; } static bool cudaSupport() { return false; } static bool cxxRecordStaticMembers() { return false; } @@ -237,8 +239,10 @@ struct MissingFeatures { static bool dataLayoutPtrHandlingBasedOnLangAS() { return false; } static bool deferredCXXGlobalInit() { return false; } static bool deleteArray() { return false; } + static bool devirtualizeDestructor() { return false; } static bool devirtualizeMemberFunction() { return false; } static bool ehCleanupFlags() { return false; } + static bool ehCleanupHasPrebranchedFallthrough() { return false; } static bool ehCleanupScope() { return false; } static bool ehCleanupScopeRequiresEHCleanup() { return false; } static bool ehCleanupBranchFixups() { return false; } @@ -257,6 +261,7 @@ struct MissingFeatures { static bool generateDebugInfo() { return false; } static bool globalViewIndices() { return false; } static bool globalViewIntLowering() { return false; } + static bool handleBuiltinICEArguments() { return false; } static bool hip() { return false; } static bool incrementProfileCounter() { return false; } static bool innermostEHScope() { return false; } 
@@ -295,6 +300,7 @@ struct MissingFeatures { static bool setNonGC() { return false; } static bool setObjCGCLValueClass() { return false; } static bool setTargetAttributes() { return false; } + static bool simplifyCleanupEntry() { return false; } static bool sourceLanguageCases() { return false; } static bool stackBase() { return false; } static bool stackSaveOp() { return false; } diff --git a/clang/include/clang/Driver/CommonArgs.h b/clang/include/clang/Driver/CommonArgs.h index 85f4a45752614..56c2a65ce8c7b 100644 --- a/clang/include/clang/Driver/CommonArgs.h +++ b/clang/include/clang/Driver/CommonArgs.h @@ -89,6 +89,9 @@ void SplitDebugInfo(const ToolChain &TC, Compilation &C, const Tool &T, const JobAction &JA, const llvm::opt::ArgList &Args, const InputInfo &Output, const char *OutFile); +void addDTLTOOptions(const ToolChain &ToolChain, const llvm::opt::ArgList &Args, + llvm::opt::ArgStringList &CmdArgs); + void addLTOOptions(const ToolChain &ToolChain, const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs, const InputInfo &Output, const InputInfoList &Inputs, bool IsThinLTO); diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index f6a434001152b..e024ac254e1ab 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -2760,6 +2760,10 @@ def falloc_token_max_EQ : Joined<["-"], "falloc-token-max=">, MetaVarName<"">, HelpText<"Limit to maximum N allocation tokens (0 = no max)">; +def falloc_token_mode_EQ : Joined<["-"], "falloc-token-mode=">, + Group, Visibility<[CC1Option]>, + HelpText<"Set the allocation token mode (experimental)">; + def fallow_runtime_check_skip_hot_cutoff_EQ : Joined<["-"], "fallow-runtime-check-skip-hot-cutoff=">, Group, @@ -6706,6 +6710,8 @@ def mv75 : Flag<["-"], "mv75">, Group, Alias, AliasArgs<["hexagonv75"]>; def mv79 : Flag<["-"], "mv79">, Group, Alias, AliasArgs<["hexagonv79"]>; +def mv81 : Flag<["-"], "mv81">, Group, + Alias, 
AliasArgs<["hexagonv81"]>; def mhexagon_hvx : Flag<["-"], "mhvx">, Group, HelpText<"Enable Hexagon Vector eXtensions">; def mhexagon_hvx_EQ : Joined<["-"], "mhvx=">, diff --git a/clang/include/clang/Frontend/ASTUnit.h b/clang/include/clang/Frontend/ASTUnit.h index f66df89aad904..3cea159afa33c 100644 --- a/clang/include/clang/Frontend/ASTUnit.h +++ b/clang/include/clang/Frontend/ASTUnit.h @@ -499,6 +499,11 @@ class ASTUnit { return *PPOpts; } + IntrusiveRefCntPtr getVirtualFileSystemPtr() { + // FIXME: Don't defer VFS ownership to the FileManager. + return FileMgr->getVirtualFileSystemPtr(); + } + const FileManager &getFileManager() const { return *FileMgr; } FileManager &getFileManager() { return *FileMgr; } IntrusiveRefCntPtr getFileManagerPtr() { return FileMgr; } diff --git a/clang/include/clang/Frontend/CompilerInstance.h b/clang/include/clang/Frontend/CompilerInstance.h index 44fff69c217c5..2403cbbb652dd 100644 --- a/clang/include/clang/Frontend/CompilerInstance.h +++ b/clang/include/clang/Frontend/CompilerInstance.h @@ -460,7 +460,7 @@ class CompilerInstance : public ModuleLoader { FileMgr.resetWithoutRelease(); } - /// Replace the current file manager and virtual file system. + /// Replace the current file manager. void setFileManager(IntrusiveRefCntPtr Value); /// @} diff --git a/clang/include/clang/Sema/Initialization.h b/clang/include/clang/Sema/Initialization.h index 5e96317ffb7fe..0e2891f1ff5c2 100644 --- a/clang/include/clang/Sema/Initialization.h +++ b/clang/include/clang/Sema/Initialization.h @@ -91,6 +91,10 @@ class alignas(8) InitializedEntity { /// or vector. EK_VectorElement, + /// The entity being initialized is an element of a matrix. + /// or matrix. + EK_MatrixElement, + /// The entity being initialized is a field of block descriptor for /// the copied-in c++ object. EK_BlockElement, @@ -205,8 +209,8 @@ class alignas(8) InitializedEntity { /// virtual base. 
llvm::PointerIntPair Base; - /// When Kind == EK_ArrayElement, EK_VectorElement, or - /// EK_ComplexElement, the index of the array or vector element being + /// When Kind == EK_ArrayElement, EK_VectorElement, EK_MatrixElement, + /// or EK_ComplexElement, the index of the array or vector element being /// initialized. unsigned Index; @@ -536,7 +540,7 @@ class alignas(8) InitializedEntity { /// element's index. unsigned getElementIndex() const { assert(getKind() == EK_ArrayElement || getKind() == EK_VectorElement || - getKind() == EK_ComplexElement); + getKind() == EK_MatrixElement || getKind() == EK_ComplexElement); return Index; } @@ -544,7 +548,7 @@ class alignas(8) InitializedEntity { /// element, sets the element index. void setElementIndex(unsigned Index) { assert(getKind() == EK_ArrayElement || getKind() == EK_VectorElement || - getKind() == EK_ComplexElement); + getKind() == EK_MatrixElement || getKind() == EK_ComplexElement); this->Index = Index; } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 87b96c2d5ad09..52904c72d1cfc 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -1321,15 +1321,11 @@ class Sema final : public SemaBase { /// Callback to the parser to parse templated functions when needed. typedef void LateTemplateParserCB(void *P, LateParsedTemplate &LPT); - typedef void LateTemplateParserCleanupCB(void *P); LateTemplateParserCB *LateTemplateParser; - LateTemplateParserCleanupCB *LateTemplateParserCleanup; void *OpaqueParser; - void SetLateTemplateParser(LateTemplateParserCB *LTP, - LateTemplateParserCleanupCB *LTPCleanup, void *P) { + void SetLateTemplateParser(LateTemplateParserCB *LTP, void *P) { LateTemplateParser = LTP; - LateTemplateParserCleanup = LTPCleanup; OpaqueParser = P; } @@ -2685,11 +2681,6 @@ class Sema final : public SemaBase { /// function without this attribute. 
bool DiscardingCFIUncheckedCallee(QualType From, QualType To) const; - /// Returns true if `From` is a function or pointer to a function without the - /// `cfi_unchecked_callee` attribute but `To` is a function or pointer to - /// function with this attribute. - bool AddingCFIUncheckedCallee(QualType From, QualType To) const; - /// This function calls Action when it determines that E designates a /// misaligned member due to the packed attribute. This is used to emit /// local diagnostics like in reference binding. diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h index f9d3a4ea94806..8c3b6ae176389 100644 --- a/clang/include/clang/Sema/SemaHLSL.h +++ b/clang/include/clang/Sema/SemaHLSL.h @@ -130,9 +130,6 @@ class SemaHLSL : public SemaBase { bool ActOnUninitializedVarDecl(VarDecl *D); void ActOnEndOfTranslationUnit(TranslationUnitDecl *TU); void CheckEntryPoint(FunctionDecl *FD); - bool isSemanticValid(FunctionDecl *FD, DeclaratorDecl *D); - void CheckSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param, - const HLSLAnnotationAttr *AnnotationAttr); bool CheckResourceBinOp(BinaryOperatorKind Opc, Expr *LHSExpr, Expr *RHSExpr, SourceLocation Loc); void DiagnoseAttrStageMismatch( @@ -179,17 +176,17 @@ class SemaHLSL : public SemaBase { bool handleResourceTypeAttr(QualType T, const ParsedAttr &AL); template - T *createSemanticAttr(const ParsedAttr &AL, + T *createSemanticAttr(const AttributeCommonInfo &ACI, NamedDecl *TargetDecl, std::optional Location) { - T *Attr = ::new (getASTContext()) T(getASTContext(), AL); - if (Attr->isSemanticIndexable()) - Attr->setSemanticIndex(Location ? 
*Location : 0); - else if (Location.has_value()) { + T *Attr = + ::new (getASTContext()) T(getASTContext(), ACI, TargetDecl, + Location.value_or(0), Location.has_value()); + + if (!Attr->isSemanticIndexable() && Location.has_value()) { Diag(Attr->getLocation(), diag::err_hlsl_semantic_indexing_not_supported) << Attr->getAttrName()->getName(); return nullptr; } - return Attr; } @@ -247,10 +244,25 @@ class SemaHLSL : public SemaBase { IdentifierInfo *RootSigOverrideIdent = nullptr; + struct SemanticInfo { + HLSLSemanticAttr *Semantic; + std::optional Index; + }; + private: void collectResourceBindingsOnVarDecl(VarDecl *D); void collectResourceBindingsOnUserRecordDecl(const VarDecl *VD, const RecordType *RT); + + void checkSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param, + const HLSLSemanticAttr *SemanticAttr); + HLSLSemanticAttr *createSemantic(const SemanticInfo &Semantic, + DeclaratorDecl *TargetDecl); + bool determineActiveSemanticOnScalar(FunctionDecl *FD, DeclaratorDecl *D, + SemanticInfo &ActiveSemantic); + bool determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D, + SemanticInfo &ActiveSemantic); + void processExplicitBindingsOnDecl(VarDecl *D); void diagnoseAvailabilityViolations(TranslationUnitDecl *TU); diff --git a/clang/include/clang/Support/RISCVVIntrinsicUtils.h b/clang/include/clang/Support/RISCVVIntrinsicUtils.h index 29a07f1985722..67149f23116e0 100644 --- a/clang/include/clang/Support/RISCVVIntrinsicUtils.h +++ b/clang/include/clang/Support/RISCVVIntrinsicUtils.h @@ -402,6 +402,7 @@ class RVVIntrinsic { std::vector IntrinsicTypes; unsigned NF = 1; Policy PolicyAttrs; + unsigned TWiden = 0; public: RVVIntrinsic(llvm::StringRef Name, llvm::StringRef Suffix, @@ -410,8 +411,8 @@ class RVVIntrinsic { bool HasVL, PolicyScheme Scheme, bool SupportOverloading, bool HasBuiltinAlias, llvm::StringRef ManualCodegen, const RVVTypes &Types, - const std::vector &IntrinsicTypes, - unsigned NF, Policy PolicyAttrs, bool HasFRMRoundModeOp); + 
const std::vector &IntrinsicTypes, unsigned NF, + Policy PolicyAttrs, bool HasFRMRoundModeOp, unsigned TWiden); ~RVVIntrinsic() = default; RVVTypePtr getOutputType() const { return OutputType; } @@ -435,6 +436,7 @@ class RVVIntrinsic { llvm::StringRef getManualCodegen() const { return ManualCodegen; } PolicyScheme getPolicyScheme() const { return Scheme; } unsigned getNF() const { return NF; } + unsigned getTWiden() const { return TWiden; } const std::vector &getIntrinsicTypes() const { return IntrinsicTypes; } diff --git a/clang/lib/AST/ASTConcept.cpp b/clang/lib/AST/ASTConcept.cpp index 9ea104c4c3c9d..fd12bc4e83827 100644 --- a/clang/lib/AST/ASTConcept.cpp +++ b/clang/lib/AST/ASTConcept.cpp @@ -86,7 +86,7 @@ void ConstraintSatisfaction::Profile(llvm::FoldingSetNodeID &ID, ID.AddPointer(ConstraintOwner); ID.AddInteger(TemplateArgs.size()); for (auto &Arg : TemplateArgs) - C.getCanonicalTemplateArgument(Arg).Profile(ID, C); + Arg.Profile(ID, C); } ConceptReference * diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index a98574445bf33..aafeb35a801fa 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -1653,6 +1653,9 @@ ASTContext::findPointerAuthContent(QualType T) const { if (!RD) return PointerAuthContent::None; + if (RD->isInvalidDecl()) + return PointerAuthContent::None; + if (auto Existing = RecordContainsAddressDiscriminatedPointerAuth.find(RD); Existing != RecordContainsAddressDiscriminatedPointerAuth.end()) return Existing->second; @@ -3522,7 +3525,6 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx, uint16_t ASTContext::getPointerAuthTypeDiscriminator(QualType T) { assert(!T->isDependentType() && "cannot compute type discriminator of a dependent type"); - SmallString<256> Str; llvm::raw_svector_ostream Out(Str); diff --git a/clang/lib/AST/ByteCode/Compiler.cpp b/clang/lib/AST/ByteCode/Compiler.cpp index 6b989276e6d7d..f4ddbf40da951 100644 --- a/clang/lib/AST/ByteCode/Compiler.cpp +++ 
b/clang/lib/AST/ByteCode/Compiler.cpp @@ -1842,7 +1842,6 @@ bool Compiler::visitInitList(ArrayRef Inits, const Expr *Init, PrimType T, bool Activate = false) -> bool { InitStackScope ISS(this, isa(Init)); - InitLinkScope ILS(this, InitLink::Field(FieldToInit->Offset)); if (!this->visit(Init)) return false; @@ -3274,34 +3273,43 @@ bool Compiler::VisitCXXConstructExpr(const CXXConstructExpr *E) { } if (T->isArrayType()) { - const ConstantArrayType *CAT = - Ctx.getASTContext().getAsConstantArrayType(E->getType()); - if (!CAT) - return false; - - size_t NumElems = CAT->getZExtSize(); const Function *Func = getFunction(E->getConstructor()); if (!Func) return false; - // FIXME(perf): We're calling the constructor once per array element here, - // in the old intepreter we had a special-case for trivial constructors. - for (size_t I = 0; I != NumElems; ++I) { - if (!this->emitConstUint64(I, E)) - return false; - if (!this->emitArrayElemPtrUint64(E)) - return false; + if (!this->emitDupPtr(E)) + return false; - // Constructor arguments. - for (const auto *Arg : E->arguments()) { - if (!this->visit(Arg)) - return false; + std::function initArrayDimension; + initArrayDimension = [&](QualType T) -> bool { + if (!T->isArrayType()) { + // Constructor arguments. 
+ for (const auto *Arg : E->arguments()) { + if (!this->visit(Arg)) + return false; + } + + return this->emitCall(Func, 0, E); } - if (!this->emitCall(Func, 0, E)) + const ConstantArrayType *CAT = + Ctx.getASTContext().getAsConstantArrayType(T); + if (!CAT) return false; - } - return true; + QualType ElemTy = CAT->getElementType(); + unsigned NumElems = CAT->getZExtSize(); + for (size_t I = 0; I != NumElems; ++I) { + if (!this->emitConstUint64(I, E)) + return false; + if (!this->emitArrayElemPtrUint64(E)) + return false; + if (!initArrayDimension(ElemTy)) + return false; + } + return this->emitPopPtr(E); + }; + + return initArrayDimension(E->getType()); } return false; @@ -3600,8 +3608,6 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { if (PlacementDest) { if (!this->visit(PlacementDest)) return false; - if (!this->emitStartLifetime(E)) - return false; if (!this->emitGetLocal(SizeT, ArrayLen, E)) return false; if (!this->emitCheckNewTypeMismatchArray(SizeT, E, E)) @@ -3741,10 +3747,9 @@ bool Compiler::VisitCXXNewExpr(const CXXNewExpr *E) { if (PlacementDest) { if (!this->visit(PlacementDest)) return false; - if (!this->emitStartLifetime(E)) - return false; if (!this->emitCheckNewTypeMismatch(E, E)) return false; + } else { // Allocate just one element. if (!this->emitAlloc(Desc, E)) @@ -5385,55 +5390,57 @@ bool Compiler::VisitCXXThisExpr(const CXXThisExpr *E) { // instance pointer of the current function frame, but e.g. to the declaration // currently being initialized. Here we emit the necessary instruction(s) for // this scenario. - if (!InitStackActive) + if (!InitStackActive || InitStack.empty()) return this->emitThis(E); - if (!InitStack.empty()) { - // If our init stack is, for example: - // 0 Stack: 3 (decl) - // 1 Stack: 6 (init list) - // 2 Stack: 1 (field) - // 3 Stack: 6 (init list) - // 4 Stack: 1 (field) - // - // We want to find the LAST element in it that's an init list, - // which is marked with the K_InitList marker. 
The index right - // before that points to an init list. We need to find the - // elements before the K_InitList element that point to a base - // (e.g. a decl or This), optionally followed by field, elem, etc. - // In the example above, we want to emit elements [0..2]. - unsigned StartIndex = 0; - unsigned EndIndex = 0; - // Find the init list. - for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) { - if (InitStack[StartIndex].Kind == InitLink::K_InitList || - InitStack[StartIndex].Kind == InitLink::K_This) { - EndIndex = StartIndex; - --StartIndex; - break; - } + // If our init stack is, for example: + // 0 Stack: 3 (decl) + // 1 Stack: 6 (init list) + // 2 Stack: 1 (field) + // 3 Stack: 6 (init list) + // 4 Stack: 1 (field) + // + // We want to find the LAST element in it that's an init list, + // which is marked with the K_InitList marker. The index right + // before that points to an init list. We need to find the + // elements before the K_InitList element that point to a base + // (e.g. a decl or This), optionally followed by field, elem, etc. + // In the example above, we want to emit elements [0..2]. + unsigned StartIndex = 0; + unsigned EndIndex = 0; + // Find the init list. + for (StartIndex = InitStack.size() - 1; StartIndex > 0; --StartIndex) { + if (InitStack[StartIndex].Kind == InitLink::K_InitList || + InitStack[StartIndex].Kind == InitLink::K_This) { + EndIndex = StartIndex; + --StartIndex; + break; } + } - // Walk backwards to find the base. - for (; StartIndex > 0; --StartIndex) { - if (InitStack[StartIndex].Kind == InitLink::K_InitList) - continue; + // Walk backwards to find the base. 
+ for (; StartIndex > 0; --StartIndex) { + if (InitStack[StartIndex].Kind == InitLink::K_InitList) + continue; - if (InitStack[StartIndex].Kind != InitLink::K_Field && - InitStack[StartIndex].Kind != InitLink::K_Elem) - break; - } + if (InitStack[StartIndex].Kind != InitLink::K_Field && + InitStack[StartIndex].Kind != InitLink::K_Elem) + break; + } - // Emit the instructions. - for (unsigned I = StartIndex; I != EndIndex; ++I) { - if (InitStack[I].Kind == InitLink::K_InitList) - continue; - if (!InitStack[I].template emit(this, E)) - return false; - } - return true; + if (StartIndex == 0 && EndIndex == 0) + EndIndex = InitStack.size() - 1; + + assert(StartIndex < EndIndex); + + // Emit the instructions. + for (unsigned I = StartIndex; I != (EndIndex + 1); ++I) { + if (InitStack[I].Kind == InitLink::K_InitList) + continue; + if (!InitStack[I].template emit(this, E)) + return false; } - return this->emitThis(E); + return true; } template bool Compiler::visitStmt(const Stmt *S) { @@ -6295,6 +6302,10 @@ bool Compiler::compileConstructor(const CXXConstructorDecl *Ctor) { } assert(NestedField); + unsigned FirstLinkOffset = + R->getField(cast(IFD->chain()[0]))->Offset; + InitStackScope ISS(this, isa(InitExpr)); + InitLinkScope ILS(this, InitLink::Field(FirstLinkOffset)); if (!emitFieldInitializer(NestedField, NestedFieldOffset, InitExpr, IsUnion)) return false; @@ -6432,6 +6443,13 @@ bool Compiler::visitFunc(const FunctionDecl *F) { return this->emitNoRet(SourceInfo{}); } +static uint32_t getBitWidth(const Expr *E) { + assert(E->refersToBitField()); + const auto *ME = cast(E); + const auto *FD = cast(ME->getMemberDecl()); + return FD->getBitWidthValue(); +} + template bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { const Expr *SubExpr = E->getSubExpr(); @@ -6460,10 +6478,15 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return DiscardResult ? this->emitPopPtr(E) : true; } - if (T == PT_Float) { + if (T == PT_Float) return DiscardResult ? 
this->emitIncfPop(getFPOptions(E), E) : this->emitIncf(getFPOptions(E), E); - } + + if (SubExpr->refersToBitField()) + return DiscardResult ? this->emitIncPopBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E) + : this->emitIncBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E); return DiscardResult ? this->emitIncPop(*T, E->canOverflow(), E) : this->emitInc(*T, E->canOverflow(), E); @@ -6484,9 +6507,15 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return DiscardResult ? this->emitPopPtr(E) : true; } - if (T == PT_Float) { + if (T == PT_Float) return DiscardResult ? this->emitDecfPop(getFPOptions(E), E) : this->emitDecf(getFPOptions(E), E); + + if (SubExpr->refersToBitField()) { + return DiscardResult ? this->emitDecPopBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E) + : this->emitDecBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E); } return DiscardResult ? this->emitDecPop(*T, E->canOverflow(), E) @@ -6515,6 +6544,11 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (DiscardResult) { if (T == PT_Float) return this->emitIncfPop(getFPOptions(E), E); + if (SubExpr->refersToBitField()) + return DiscardResult ? 
this->emitIncPopBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E) + : this->emitIncBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E); return this->emitIncPop(*T, E->canOverflow(), E); } @@ -6530,6 +6564,11 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return false; if (!this->emitStoreFloat(E)) return false; + } else if (SubExpr->refersToBitField()) { + assert(isIntegralType(*T)); + if (!this->emitPreIncBitfield(*T, E->canOverflow(), getBitWidth(SubExpr), + E)) + return false; } else { assert(isIntegralType(*T)); if (!this->emitPreInc(*T, E->canOverflow(), E)) @@ -6560,6 +6599,11 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { if (DiscardResult) { if (T == PT_Float) return this->emitDecfPop(getFPOptions(E), E); + if (SubExpr->refersToBitField()) + return DiscardResult ? this->emitDecPopBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E) + : this->emitDecBitfield(*T, E->canOverflow(), + getBitWidth(SubExpr), E); return this->emitDecPop(*T, E->canOverflow(), E); } @@ -6575,6 +6619,11 @@ bool Compiler::VisitUnaryOperator(const UnaryOperator *E) { return false; if (!this->emitStoreFloat(E)) return false; + } else if (SubExpr->refersToBitField()) { + assert(isIntegralType(*T)); + if (!this->emitPreDecBitfield(*T, E->canOverflow(), getBitWidth(SubExpr), + E)) + return false; } else { assert(isIntegralType(*T)); if (!this->emitPreDec(*T, E->canOverflow(), E)) diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index a72282caf5e73..169a9a2c7b811 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1903,12 +1903,19 @@ bool CheckNewTypeMismatch(InterpState &S, CodePtr OpPC, const Expr *E, if (Ptr.inUnion() && Ptr.getBase().getRecord()->isUnion()) Ptr.activate(); + if (Ptr.isZero()) { + S.FFDiag(S.Current->getSource(OpPC), diag::note_constexpr_access_null) + << AK_Construct; + return false; + } + if (!Ptr.isBlockPointer()) return false; + 
startLifetimeRecurse(Ptr); + // Similar to CheckStore(), but with the additional CheckTemporary() call and // the AccessKinds are different. - if (!Ptr.block()->isAccessible()) { if (!CheckExtern(S, OpPC, Ptr)) return false; diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index d8529daf591ee..89f6fbefb1907 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -702,7 +702,7 @@ enum class IncDecOp { template bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr, - bool CanOverflow) { + bool CanOverflow, UnsignedOrNone BitWidth = std::nullopt) { assert(!Ptr.isDummy()); if (!S.inConstantContext()) { @@ -725,12 +725,18 @@ bool IncDecHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr, if constexpr (Op == IncDecOp::Inc) { if (!T::increment(Value, &Result) || !CanOverflow) { - Ptr.deref() = Result; + if (BitWidth) + Ptr.deref() = Result.truncate(*BitWidth); + else + Ptr.deref() = Result; return true; } } else { if (!T::decrement(Value, &Result) || !CanOverflow) { - Ptr.deref() = Result; + if (BitWidth) + Ptr.deref() = Result.truncate(*BitWidth); + else + Ptr.deref() = Result; return true; } } @@ -774,6 +780,17 @@ bool Inc(InterpState &S, CodePtr OpPC, bool CanOverflow) { CanOverflow); } +template ::T> +bool IncBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + unsigned BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + /// 1) Pops a pointer from the stack /// 2) Load the value from the pointer /// 3) Writes the value increased by one back to the pointer @@ -786,6 +803,17 @@ bool IncPop(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool IncPopBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + if (!CheckLoad(S, OpPC, Ptr, 
AK_Increment)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + template ::T> bool PreInc(InterpState &S, CodePtr OpPC, bool CanOverflow) { const Pointer &Ptr = S.Stk.peek(); @@ -795,6 +823,17 @@ bool PreInc(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool PreIncBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.peek(); + if (!CheckLoad(S, OpPC, Ptr, AK_Increment)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + /// 1) Pops a pointer from the stack /// 2) Load the value from the pointer /// 3) Writes the value decreased by one back to the pointer @@ -808,6 +847,16 @@ bool Dec(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool DecBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} /// 1) Pops a pointer from the stack /// 2) Load the value from the pointer @@ -821,6 +870,17 @@ bool DecPop(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool DecPopBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const Pointer &Ptr = S.Stk.pop(); + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) + return false; + + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + template ::T> bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) { const Pointer &Ptr = S.Stk.peek(); @@ -829,6 +889,16 @@ bool PreDec(InterpState &S, CodePtr OpPC, bool CanOverflow) { return IncDecHelper(S, OpPC, Ptr, CanOverflow); } +template ::T> +bool PreDecBitfield(InterpState &S, CodePtr OpPC, bool CanOverflow, + uint32_t BitWidth) { + const 
Pointer &Ptr = S.Stk.peek(); + if (!CheckLoad(S, OpPC, Ptr, AK_Decrement)) + return false; + return IncDecHelper(S, OpPC, Ptr, CanOverflow, + BitWidth); +} + template bool IncDecFloatHelper(InterpState &S, CodePtr OpPC, const Pointer &Ptr, uint32_t FPOI) { diff --git a/clang/lib/AST/ByteCode/InterpBuiltin.cpp b/clang/lib/AST/ByteCode/InterpBuiltin.cpp index ff83c52b0c8f6..d0b97a18e1815 100644 --- a/clang/lib/AST/ByteCode/InterpBuiltin.cpp +++ b/clang/lib/AST/ByteCode/InterpBuiltin.cpp @@ -12,12 +12,14 @@ #include "InterpHelpers.h" #include "PrimType.h" #include "Program.h" +#include "clang/AST/InferAlloc.h" #include "clang/AST/OSLog.h" #include "clang/AST/RecordLayout.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/TargetBuiltins.h" #include "clang/Basic/TargetInfo.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/Support/AllocToken.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/SipHash.h" @@ -1307,6 +1309,45 @@ interp__builtin_ptrauth_string_discriminator(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_infer_alloc_token(InterpState &S, CodePtr OpPC, + const InterpFrame *Frame, + const CallExpr *Call) { + const ASTContext &ASTCtx = S.getASTContext(); + uint64_t BitWidth = ASTCtx.getTypeSize(ASTCtx.getSizeType()); + auto Mode = + ASTCtx.getLangOpts().AllocTokenMode.value_or(llvm::DefaultAllocTokenMode); + uint64_t MaxTokens = + ASTCtx.getLangOpts().AllocTokenMax.value_or(~0ULL >> (64 - BitWidth)); + + // We do not read any of the arguments; discard them. + for (int I = Call->getNumArgs() - 1; I >= 0; --I) + discard(S.Stk, *S.getContext().classify(Call->getArg(I))); + + // Note: Type inference from a surrounding cast is not supported in + // constexpr evaluation. 
+ QualType AllocType = infer_alloc::inferPossibleType(Call, ASTCtx, nullptr); + if (AllocType.isNull()) { + S.CCEDiag(Call, + diag::note_constexpr_infer_alloc_token_type_inference_failed); + return false; + } + + auto ATMD = infer_alloc::getAllocTokenMetadata(AllocType, ASTCtx); + if (!ATMD) { + S.CCEDiag(Call, diag::note_constexpr_infer_alloc_token_no_metadata); + return false; + } + + auto MaybeToken = llvm::getAllocToken(Mode, *ATMD, MaxTokens); + if (!MaybeToken) { + S.CCEDiag(Call, diag::note_constexpr_infer_alloc_token_stateful_mode); + return false; + } + + pushInteger(S, llvm::APInt(BitWidth, *MaybeToken), ASTCtx.getSizeType()); + return true; +} + static bool interp__builtin_operator_new(InterpState &S, CodePtr OpPC, const InterpFrame *Frame, const CallExpr *Call) { @@ -3279,6 +3320,65 @@ static bool interp__builtin_ia32_vpconflict(InterpState &S, CodePtr OpPC, return true; } +static bool interp__builtin_x86_byteshift( + InterpState &S, CodePtr OpPC, const CallExpr *Call, unsigned ID, + llvm::function_ref + Fn) { + assert(Call->getNumArgs() == 2); + + APSInt ImmAPS = popToAPSInt(S, Call->getArg(1)); + uint64_t Shift = ImmAPS.getZExtValue() & 0xff; + + const Pointer &Src = S.Stk.pop(); + if (!Src.getFieldDesc()->isPrimitiveArray()) + return false; + + unsigned NumElems = Src.getNumElems(); + const Pointer &Dst = S.Stk.peek(); + PrimType ElemT = Src.getFieldDesc()->getPrimType(); + + for (unsigned Lane = 0; Lane != NumElems; Lane += 16) { + for (unsigned I = 0; I != 16; ++I) { + unsigned Base = Lane + I; + APSInt Result = APSInt(Fn(Src, Lane, I, Shift)); + INT_TYPE_SWITCH_NO_BOOL(ElemT, + { Dst.elem(Base) = static_cast(Result); }); + } + } + + Dst.initializeAllElements(); + + return true; +} + +static bool interp__builtin_ia32_shuffle_generic( + InterpState &S, CodePtr OpPC, const CallExpr *Call, + llvm::function_ref(unsigned, unsigned)> + GetSourceIndex) { + + assert(Call->getNumArgs() == 3); + unsigned ShuffleMask = popToAPSInt(S, 
Call->getArg(2)).getZExtValue(); + + QualType Arg0Type = Call->getArg(0)->getType(); + const auto *VecT = Arg0Type->castAs(); + PrimType ElemT = *S.getContext().classify(VecT->getElementType()); + unsigned NumElems = VecT->getNumElements(); + + const Pointer &B = S.Stk.pop(); + const Pointer &A = S.Stk.pop(); + const Pointer &Dst = S.Stk.peek(); + + for (unsigned DstIdx = 0; DstIdx != NumElems; ++DstIdx) { + auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); + const Pointer &Src = (SrcVecIdx == 0) ? A : B; + TYPE_SWITCH(ElemT, { Dst.elem(DstIdx) = Src.elem(SrcIdx); }); + } + Dst.initializeAllElements(); + + return true; +} + bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, uint32_t BuiltinID) { if (!S.getASTContext().BuiltinInfo.isConstantEvaluated(BuiltinID)) @@ -3471,7 +3571,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI_lrotl: case Builtin::BI_rotl64: return interp__builtin_elementwise_int_binop( - S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) -> APInt { + S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) { return Value.rotl(Amount); }); @@ -3485,7 +3585,7 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI_lrotr: case Builtin::BI_rotr64: return interp__builtin_elementwise_int_binop( - S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) -> APInt { + S, OpPC, Call, [](const APSInt &Value, const APSInt &Amount) { return Value.rotr(Amount); }); @@ -3694,6 +3794,9 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case Builtin::BI__builtin_ptrauth_string_discriminator: return interp__builtin_ptrauth_string_discriminator(S, OpPC, Frame, Call); + case Builtin::BI__builtin_infer_alloc_token: + return interp__builtin_infer_alloc_token(S, OpPC, Frame, Call); + case Builtin::BI__noop: pushInteger(S, 0, Call->getType()); return true; @@ -3809,6 +3912,21 @@ bool 
InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, return interp__builtin_ia32_movmsk_op(S, OpPC, Call); } + case X86::BI__builtin_ia32_psignb128: + case X86::BI__builtin_ia32_psignb256: + case X86::BI__builtin_ia32_psignw128: + case X86::BI__builtin_ia32_psignw256: + case X86::BI__builtin_ia32_psignd128: + case X86::BI__builtin_ia32_psignd256: + return interp__builtin_elementwise_int_binop( + S, OpPC, Call, [](const APInt &AElem, const APInt &BElem) { + if (BElem.isZero()) + return APInt::getZero(AElem.getBitWidth()); + if (BElem.isNegative()) + return -AElem; + return AElem; + }); + case clang::X86::BI__builtin_ia32_pavgb128: case clang::X86::BI__builtin_ia32_pavgw128: case clang::X86::BI__builtin_ia32_pavgb256: @@ -4191,6 +4309,42 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_selectpd_512: return interp__builtin_select(S, OpPC, Call); + case X86::BI__builtin_ia32_shufps: + case X86::BI__builtin_ia32_shufps256: + case X86::BI__builtin_ia32_shufps512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned NumElemPerLane = 4; + unsigned NumSelectableElems = NumElemPerLane / 2; + unsigned BitsPerElem = 2; + unsigned IndexMask = 0x3; + unsigned MaskBits = 8; + unsigned Lane = DstIdx / NumElemPerLane; + unsigned ElemInLane = DstIdx % NumElemPerLane; + unsigned LaneOffset = Lane * NumElemPerLane; + unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 
1 : 0; + unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; + unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; + return std::pair{SrcIdx, LaneOffset + Index}; + }); + case X86::BI__builtin_ia32_shufpd: + case X86::BI__builtin_ia32_shufpd256: + case X86::BI__builtin_ia32_shufpd512: + return interp__builtin_ia32_shuffle_generic( + S, OpPC, Call, [](unsigned DstIdx, unsigned ShuffleMask) { + unsigned NumElemPerLane = 2; + unsigned NumSelectableElems = NumElemPerLane / 2; + unsigned BitsPerElem = 1; + unsigned IndexMask = 0x1; + unsigned MaskBits = 8; + unsigned Lane = DstIdx / NumElemPerLane; + unsigned ElemInLane = DstIdx % NumElemPerLane; + unsigned LaneOffset = Lane * NumElemPerLane; + unsigned SrcIdx = ElemInLane >= NumSelectableElems ? 1 : 0; + unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; + unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; + return std::pair{SrcIdx, LaneOffset + Index}; + }); case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: @@ -4331,6 +4485,39 @@ bool InterpretBuiltin(InterpState &S, CodePtr OpPC, const CallExpr *Call, case X86::BI__builtin_ia32_vec_set_v4di: return interp__builtin_vec_set(S, OpPC, Call, BuiltinID); + case X86::BI__builtin_ia32_pslldqi128_byteshift: + case X86::BI__builtin_ia32_pslldqi256_byteshift: + case X86::BI__builtin_ia32_pslldqi512_byteshift: + // These SLLDQ intrinsics always operate on byte elements (8 bits). + // The lane width is hardcoded to 16 to match the SIMD register size, + // but the algorithm processes one byte per iteration, + // so APInt(8, ...) is correct and intentional. 
+ return interp__builtin_x86_byteshift( + S, OpPC, Call, BuiltinID, + [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) { + if (I < Shift) { + return APInt(8, 0); + } + return APInt(8, Src.elem(Lane + I - Shift)); + }); + + case X86::BI__builtin_ia32_psrldqi128_byteshift: + case X86::BI__builtin_ia32_psrldqi256_byteshift: + case X86::BI__builtin_ia32_psrldqi512_byteshift: + // These SRLDQ intrinsics always operate on byte elements (8 bits). + // The lane width is hardcoded to 16 to match the SIMD register size, + // but the algorithm processes one byte per iteration, + // so APInt(8, ...) is correct and intentional. + return interp__builtin_x86_byteshift( + S, OpPC, Call, BuiltinID, + [](const Pointer &Src, unsigned Lane, unsigned I, unsigned Shift) { + if (I + Shift < 16) { + return APInt(8, Src.elem(Lane + I + Shift)); + } + + return APInt(8, 0); + }); + default: S.FFDiag(S.Current->getLocation(OpPC), diag::note_invalid_subexpr_in_const_expr) diff --git a/clang/lib/AST/ByteCode/Opcodes.td b/clang/lib/AST/ByteCode/Opcodes.td index 532c4448e6f40..1c17ad9e95d05 100644 --- a/clang/lib/AST/ByteCode/Opcodes.td +++ b/clang/lib/AST/ByteCode/Opcodes.td @@ -612,12 +612,25 @@ class OverflowOpcode : Opcode { let HasGroup = 1; } +class OverflowBitfieldOpcode : Opcode { + let Types = [AluTypeClass]; + let Args = [ArgBool, ArgUint32]; + let HasGroup = 1; +} + def Inc : OverflowOpcode; +def IncBitfield : OverflowBitfieldOpcode; def IncPop : OverflowOpcode; +def IncPopBitfield : OverflowBitfieldOpcode; def PreInc : OverflowOpcode; +def PreIncBitfield : OverflowBitfieldOpcode; + def Dec : OverflowOpcode; +def DecBitfield : OverflowBitfieldOpcode; def DecPop : OverflowOpcode; +def DecPopBitfield : OverflowBitfieldOpcode; def PreDec : OverflowOpcode; +def PreDecBitfield : OverflowBitfieldOpcode; // Float increment and decrement. 
def Incf: FloatOpcode; @@ -853,19 +866,13 @@ def Free : Opcode { let Args = [ArgBool, ArgBool]; } -def CheckNewTypeMismatch : Opcode { - let Args = [ArgExpr]; -} - -def InvalidNewDeleteExpr : Opcode { - let Args = [ArgExpr]; -} - +def CheckNewTypeMismatch : Opcode { let Args = [ArgExpr]; } def CheckNewTypeMismatchArray : Opcode { let Types = [IntegerTypeClass]; let Args = [ArgExpr]; let HasGroup = 1; } +def InvalidNewDeleteExpr : Opcode { let Args = [ArgExpr]; } def IsConstantContext: Opcode; def CheckAllocations : Opcode; diff --git a/clang/lib/AST/CMakeLists.txt b/clang/lib/AST/CMakeLists.txt index d4fd7a7f16d53..fd50e956bb865 100644 --- a/clang/lib/AST/CMakeLists.txt +++ b/clang/lib/AST/CMakeLists.txt @@ -66,6 +66,7 @@ add_clang_library(clangAST ExternalASTMerger.cpp ExternalASTSource.cpp FormatString.cpp + InferAlloc.cpp InheritViz.cpp ByteCode/BitcastBuffer.cpp ByteCode/ByteCodeEmitter.cpp diff --git a/clang/lib/AST/CommentSema.cpp b/clang/lib/AST/CommentSema.cpp index 649fba94a2b76..27ff5ab1f0c6b 100644 --- a/clang/lib/AST/CommentSema.cpp +++ b/clang/lib/AST/CommentSema.cpp @@ -1061,8 +1061,8 @@ InlineCommandRenderKind Sema::getInlineCommandRenderKind(StringRef Name) const { return llvm::StringSwitch(Name) .Case("b", InlineCommandRenderKind::Bold) - .Cases("c", "p", InlineCommandRenderKind::Monospaced) - .Cases("a", "e", "em", InlineCommandRenderKind::Emphasized) + .Cases({"c", "p"}, InlineCommandRenderKind::Monospaced) + .Cases({"a", "e", "em"}, InlineCommandRenderKind::Emphasized) .Case("anchor", InlineCommandRenderKind::Anchor) .Default(InlineCommandRenderKind::Normal); } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 00aaaab957591..29ee089505125 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -44,6 +44,7 @@ #include "clang/AST/CharUnits.h" #include "clang/AST/CurrentSourceLocExprScope.h" #include "clang/AST/Expr.h" +#include "clang/AST/InferAlloc.h" #include "clang/AST/OSLog.h" 
#include "clang/AST/OptionalDiagnostic.h" #include "clang/AST/RecordLayout.h" @@ -11618,6 +11619,39 @@ static bool evalPackBuiltin(const CallExpr *E, EvalInfo &Info, APValue &Result, return true; } +static bool evalShuffleGeneric( + EvalInfo &Info, const CallExpr *Call, APValue &Out, + llvm::function_ref(unsigned, unsigned)> + GetSourceIndex) { + + const auto *VT = Call->getType()->getAs(); + if (!VT) + return false; + + APSInt MaskImm; + if (!EvaluateInteger(Call->getArg(2), MaskImm, Info)) + return false; + unsigned ShuffleMask = static_cast(MaskImm.getZExtValue()); + + APValue A, B; + if (!EvaluateAsRValue(Info, Call->getArg(0), A) || + !EvaluateAsRValue(Info, Call->getArg(1), B)) + return false; + + unsigned NumElts = VT->getNumElements(); + SmallVector ResultElements; + ResultElements.reserve(NumElts); + + for (unsigned DstIdx = 0; DstIdx != NumElts; ++DstIdx) { + auto [SrcVecIdx, SrcIdx] = GetSourceIndex(DstIdx, ShuffleMask); + const APValue &Src = (SrcVecIdx == 0) ? A : B; + ResultElements.push_back(Src.getVectorElt(SrcIdx)); + } + + Out = APValue(ResultElements.data(), ResultElements.size()); + return true; +} + static bool evalPshufbBuiltin(EvalInfo &Info, const CallExpr *Call, APValue &Out) { APValue SrcVec, ControlVec; @@ -12312,6 +12346,20 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } + case X86::BI__builtin_ia32_psignb128: + case X86::BI__builtin_ia32_psignb256: + case X86::BI__builtin_ia32_psignw128: + case X86::BI__builtin_ia32_psignw256: + case X86::BI__builtin_ia32_psignd128: + case X86::BI__builtin_ia32_psignd256: + return EvaluateBinOpExpr([](const APInt &AElem, const APInt &BElem) { + if (BElem.isZero()) + return APInt::getZero(AElem.getBitWidth()); + if (BElem.isNegative()) + return -AElem; + return AElem; + }); + case X86::BI__builtin_ia32_blendvpd: case X86::BI__builtin_ia32_blendvpd256: case X86::BI__builtin_ia32_blendvps: @@ -12383,7 +12431,56 @@ 
bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(ResultElements.data(), ResultElements.size()), E); } - + case X86::BI__builtin_ia32_shufps: + case X86::BI__builtin_ia32_shufps256: + case X86::BI__builtin_ia32_shufps512: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, + unsigned ShuffleMask) -> std::pair { + constexpr unsigned LaneBits = 128u; + unsigned NumElemPerLane = LaneBits / 32; + unsigned NumSelectableElems = NumElemPerLane / 2; + unsigned BitsPerElem = 2; + unsigned IndexMask = (1u << BitsPerElem) - 1; + unsigned MaskBits = 8; + unsigned Lane = DstIdx / NumElemPerLane; + unsigned ElemInLane = DstIdx % NumElemPerLane; + unsigned LaneOffset = Lane * NumElemPerLane; + unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; + unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 0 : 1; + unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; + return {SrcIdx, LaneOffset + Index}; + })) + return false; + return Success(R, E); + } + case X86::BI__builtin_ia32_shufpd: + case X86::BI__builtin_ia32_shufpd256: + case X86::BI__builtin_ia32_shufpd512: { + APValue R; + if (!evalShuffleGeneric( + Info, E, R, + [](unsigned DstIdx, + unsigned ShuffleMask) -> std::pair { + constexpr unsigned LaneBits = 128u; + unsigned NumElemPerLane = LaneBits / 64; + unsigned NumSelectableElems = NumElemPerLane / 2; + unsigned BitsPerElem = 1; + unsigned IndexMask = (1u << BitsPerElem) - 1; + unsigned MaskBits = 8; + unsigned Lane = DstIdx / NumElemPerLane; + unsigned ElemInLane = DstIdx % NumElemPerLane; + unsigned LaneOffset = Lane * NumElemPerLane; + unsigned BitIndex = (DstIdx * BitsPerElem) % MaskBits; + unsigned SrcIdx = (ElemInLane < NumSelectableElems) ? 
0 : 1; + unsigned Index = (ShuffleMask >> BitIndex) & IndexMask; + return {SrcIdx, LaneOffset + Index}; + })) + return false; + return Success(R, E); + } case X86::BI__builtin_ia32_pshufb128: case X86::BI__builtin_ia32_pshufb256: case X86::BI__builtin_ia32_pshufb512: { @@ -12891,6 +12988,66 @@ bool VectorExprEvaluator::VisitCallExpr(const CallExpr *E) { return Success(APValue(Elems.data(), NumElems), E); } + + case X86::BI__builtin_ia32_pslldqi128_byteshift: + case X86::BI__builtin_ia32_pslldqi256_byteshift: + case X86::BI__builtin_ia32_pslldqi512_byteshift: { + assert(E->getNumArgs() == 2); + + APValue Src; + APSInt Imm; + if (!EvaluateAsRValue(Info, E->getArg(0), Src) || + !EvaluateInteger(E->getArg(1), Imm, Info)) + return false; + + unsigned VecLen = Src.getVectorLength(); + unsigned Shift = Imm.getZExtValue() & 0xff; + + SmallVector ResultElements; + for (unsigned Lane = 0; Lane != VecLen; Lane += 16) { + for (unsigned I = 0; I != 16; ++I) { + if (I < Shift) { + APSInt Zero(8, /*isUnsigned=*/true); + Zero = 0; + ResultElements.push_back(APValue(Zero)); + } else { + ResultElements.push_back(Src.getVectorElt(Lane + I - Shift)); + } + } + } + + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } + + case X86::BI__builtin_ia32_psrldqi128_byteshift: + case X86::BI__builtin_ia32_psrldqi256_byteshift: + case X86::BI__builtin_ia32_psrldqi512_byteshift: { + assert(E->getNumArgs() == 2); + + APValue Src; + APSInt Imm; + if (!EvaluateAsRValue(Info, E->getArg(0), Src) || + !EvaluateInteger(E->getArg(1), Imm, Info)) + return false; + + unsigned VecLen = Src.getVectorLength(); + unsigned Shift = Imm.getZExtValue() & 0xff; + + SmallVector ResultElements; + for (unsigned Lane = 0; Lane != VecLen; Lane += 16) { + for (unsigned I = 0; I != 16; ++I) { + if (I + Shift < 16) { + ResultElements.push_back(Src.getVectorElt(Lane + I + Shift)); + } else { + APSInt Zero(8, /*isUnsigned=*/true); + Zero = 0; + ResultElements.push_back(APValue(Zero)); + } + } + } 
+ + return Success(APValue(ResultElements.data(), ResultElements.size()), E); + } } } @@ -14649,6 +14806,27 @@ bool IntExprEvaluator::VisitBuiltinCallExpr(const CallExpr *E, return Success(Result, E); } + case Builtin::BI__builtin_infer_alloc_token: { + // If we fail to infer a type, this fails to be a constant expression; this + // can be checked with __builtin_constant_p(...). + QualType AllocType = infer_alloc::inferPossibleType(E, Info.Ctx, nullptr); + if (AllocType.isNull()) + return Error( + E, diag::note_constexpr_infer_alloc_token_type_inference_failed); + auto ATMD = infer_alloc::getAllocTokenMetadata(AllocType, Info.Ctx); + if (!ATMD) + return Error(E, diag::note_constexpr_infer_alloc_token_no_metadata); + auto Mode = + Info.getLangOpts().AllocTokenMode.value_or(llvm::DefaultAllocTokenMode); + uint64_t BitWidth = Info.Ctx.getTypeSize(Info.Ctx.getSizeType()); + uint64_t MaxTokens = + Info.getLangOpts().AllocTokenMax.value_or(~0ULL >> (64 - BitWidth)); + auto MaybeToken = llvm::getAllocToken(Mode, *ATMD, MaxTokens); + if (!MaybeToken) + return Error(E, diag::note_constexpr_infer_alloc_token_stateful_mode); + return Success(llvm::APInt(BitWidth, *MaybeToken), E); + } + case Builtin::BI__builtin_ffs: case Builtin::BI__builtin_ffsl: case Builtin::BI__builtin_ffsll: { diff --git a/clang/lib/AST/InferAlloc.cpp b/clang/lib/AST/InferAlloc.cpp new file mode 100644 index 0000000000000..e439ed4dbb386 --- /dev/null +++ b/clang/lib/AST/InferAlloc.cpp @@ -0,0 +1,201 @@ +//===--- InferAlloc.cpp - Allocation type inference -----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements allocation-related type inference. 
+// +//===----------------------------------------------------------------------===// + +#include "clang/AST/InferAlloc.h" +#include "clang/AST/ASTContext.h" +#include "clang/AST/Decl.h" +#include "clang/AST/DeclCXX.h" +#include "clang/AST/Expr.h" +#include "clang/AST/Type.h" +#include "clang/Basic/IdentifierTable.h" +#include "llvm/ADT/SmallPtrSet.h" + +using namespace clang; +using namespace infer_alloc; + +static bool +typeContainsPointer(QualType T, + llvm::SmallPtrSet &VisitedRD, + bool &IncompleteType) { + QualType CanonicalType = T.getCanonicalType(); + if (CanonicalType->isPointerType()) + return true; // base case + + // Look through typedef chain to check for special types. + for (QualType CurrentT = T; const auto *TT = CurrentT->getAs(); + CurrentT = TT->getDecl()->getUnderlyingType()) { + const IdentifierInfo *II = TT->getDecl()->getIdentifier(); + // Special Case: Syntactically uintptr_t is not a pointer; semantically, + // however, very likely used as such. Therefore, classify uintptr_t as a + // pointer, too. + if (II && II->isStr("uintptr_t")) + return true; + } + + // The type is an array; check the element type. + if (const ArrayType *AT = dyn_cast(CanonicalType)) + return typeContainsPointer(AT->getElementType(), VisitedRD, IncompleteType); + // The type is a struct, class, or union. + if (const RecordDecl *RD = CanonicalType->getAsRecordDecl()) { + if (!RD->isCompleteDefinition()) { + IncompleteType = true; + return false; + } + if (!VisitedRD.insert(RD).second) + return false; // already visited + // Check all fields. + for (const FieldDecl *Field : RD->fields()) { + if (typeContainsPointer(Field->getType(), VisitedRD, IncompleteType)) + return true; + } + // For C++ classes, also check base classes. + if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { + // Polymorphic types require a vptr. 
+ if (CXXRD->isDynamicClass()) + return true; + for (const CXXBaseSpecifier &Base : CXXRD->bases()) { + if (typeContainsPointer(Base.getType(), VisitedRD, IncompleteType)) + return true; + } + } + } + return false; +} + +/// Infer type from a simple sizeof expression. +static QualType inferTypeFromSizeofExpr(const Expr *E) { + const Expr *Arg = E->IgnoreParenImpCasts(); + if (const auto *UET = dyn_cast(Arg)) { + if (UET->getKind() == UETT_SizeOf) { + if (UET->isArgumentType()) + return UET->getArgumentTypeInfo()->getType(); + else + return UET->getArgumentExpr()->getType(); + } + } + return QualType(); +} + +/// Infer type from an arithmetic expression involving a sizeof. For example: +/// +/// malloc(sizeof(MyType) + padding); // infers 'MyType' +/// malloc(sizeof(MyType) * 32); // infers 'MyType' +/// malloc(32 * sizeof(MyType)); // infers 'MyType' +/// malloc(sizeof(MyType) << 1); // infers 'MyType' +/// ... +/// +/// More complex arithmetic expressions are supported, but are a heuristic, e.g. +/// when considering allocations for structs with flexible array members: +/// +/// malloc(sizeof(HasFlexArray) + sizeof(int) * 32); // infers 'HasFlexArray' +/// +static QualType inferPossibleTypeFromArithSizeofExpr(const Expr *E) { + const Expr *Arg = E->IgnoreParenImpCasts(); + // The argument is a lone sizeof expression. + if (QualType T = inferTypeFromSizeofExpr(Arg); !T.isNull()) + return T; + if (const auto *BO = dyn_cast(Arg)) { + // Argument is an arithmetic expression. Cover common arithmetic patterns + // involving sizeof. 
+ switch (BO->getOpcode()) { + case BO_Add: + case BO_Div: + case BO_Mul: + case BO_Shl: + case BO_Shr: + case BO_Sub: + if (QualType T = inferPossibleTypeFromArithSizeofExpr(BO->getLHS()); + !T.isNull()) + return T; + if (QualType T = inferPossibleTypeFromArithSizeofExpr(BO->getRHS()); + !T.isNull()) + return T; + break; + default: + break; + } + } + return QualType(); +} + +/// If the expression E is a reference to a variable, infer the type from a +/// variable's initializer if it contains a sizeof. Beware, this is a heuristic +/// and ignores if a variable is later reassigned. For example: +/// +/// size_t my_size = sizeof(MyType); +/// void *x = malloc(my_size); // infers 'MyType' +/// +static QualType inferPossibleTypeFromVarInitSizeofExpr(const Expr *E) { + const Expr *Arg = E->IgnoreParenImpCasts(); + if (const auto *DRE = dyn_cast(Arg)) { + if (const auto *VD = dyn_cast(DRE->getDecl())) { + if (const Expr *Init = VD->getInit()) + return inferPossibleTypeFromArithSizeofExpr(Init); + } + } + return QualType(); +} + +/// Deduces the allocated type by checking if the allocation call's result +/// is immediately used in a cast expression. For example: +/// +/// MyType *x = (MyType *)malloc(4096); // infers 'MyType' +/// +static QualType inferPossibleTypeFromCastExpr(const CallExpr *CallE, + const CastExpr *CastE) { + if (!CastE) + return QualType(); + QualType PtrType = CastE->getType(); + if (PtrType->isPointerType()) + return PtrType->getPointeeType(); + return QualType(); +} + +QualType infer_alloc::inferPossibleType(const CallExpr *E, + const ASTContext &Ctx, + const CastExpr *CastE) { + QualType AllocType; + // First check arguments. + for (const Expr *Arg : E->arguments()) { + AllocType = inferPossibleTypeFromArithSizeofExpr(Arg); + if (AllocType.isNull()) + AllocType = inferPossibleTypeFromVarInitSizeofExpr(Arg); + if (!AllocType.isNull()) + break; + } + // Then check later casts. 
+ if (AllocType.isNull()) + AllocType = inferPossibleTypeFromCastExpr(E, CastE); + return AllocType; +} + +std::optional +infer_alloc::getAllocTokenMetadata(QualType T, const ASTContext &Ctx) { + llvm::AllocTokenMetadata ATMD; + + // Get unique type name. + PrintingPolicy Policy(Ctx.getLangOpts()); + Policy.SuppressTagKeyword = true; + Policy.FullyQualifiedName = true; + llvm::raw_svector_ostream TypeNameOS(ATMD.TypeName); + T.getCanonicalType().print(TypeNameOS, Policy); + + // Check if QualType contains a pointer. Implements a simple DFS to + // recursively check if a type contains a pointer type. + llvm::SmallPtrSet VisitedRD; + bool IncompleteType = false; + ATMD.ContainsPointer = typeContainsPointer(T, VisitedRD, IncompleteType); + if (!ATMD.ContainsPointer && IncompleteType) + return std::nullopt; + + return ATMD; +} diff --git a/clang/lib/AST/StmtOpenACC.cpp b/clang/lib/AST/StmtOpenACC.cpp index 07e3de8eeb00d..39dfa19002da8 100644 --- a/clang/lib/AST/StmtOpenACC.cpp +++ b/clang/lib/AST/StmtOpenACC.cpp @@ -12,7 +12,9 @@ #include "clang/AST/StmtOpenACC.h" #include "clang/AST/ASTContext.h" +#include "clang/AST/ExprCXX.h" #include "clang/AST/StmtCXX.h" + using namespace clang; OpenACCComputeConstruct * @@ -322,6 +324,86 @@ OpenACCAtomicConstruct *OpenACCAtomicConstruct::Create( return Inst; } +static std::pair getBinaryOpArgs(const Expr *Op) { + if (const auto *BO = dyn_cast(Op)) { + assert(BO->isAssignmentOp()); + return {BO->getLHS(), BO->getRHS()}; + } + + const auto *OO = cast(Op); + assert(OO->isAssignmentOp()); + return {OO->getArg(0), OO->getArg(1)}; +} + +static std::pair getUnaryOpArgs(const Expr *Op) { + if (const auto *UO = dyn_cast(Op)) + return {true, UO->getSubExpr()}; + + if (const auto *OpCall = dyn_cast(Op)) { + // Post-inc/dec have a second unused argument to differentiate it, so we + // accept -- or ++ as unary, or any operator call with only 1 arg. 
+ if (OpCall->getNumArgs() == 1 || OpCall->getOperator() != OO_PlusPlus || + OpCall->getOperator() != OO_MinusMinus) + return {true, OpCall->getArg(0)}; + } + + return {false, nullptr}; +} + +const OpenACCAtomicConstruct::StmtInfo +OpenACCAtomicConstruct::getAssociatedStmtInfo() const { + // This ends up being a vastly simplified version of SemaOpenACCAtomic, since + // it doesn't have to worry about erroring out, but we should do a lot of + // asserts to ensure we don't get off into the weeds. + assert(getAssociatedStmt() && "invalid associated stmt?"); + + const Expr *AssocStmt = cast(getAssociatedStmt()); + switch (AtomicKind) { + case OpenACCAtomicKind::Capture: + assert(false && "Only 'read'/'write'/'update' have been implemented here"); + return {}; + case OpenACCAtomicKind::Read: { + // Read only supports the format 'v = x'; where both sides are a scalar + // expression. This can come in 2 forms; BinaryOperator or + // CXXOperatorCallExpr (rarely). + std::pair BinaryArgs = + getBinaryOpArgs(AssocStmt); + // We want the L-value for each side, so we ignore implicit casts. + return {BinaryArgs.first->IgnoreImpCasts(), + BinaryArgs.second->IgnoreImpCasts(), /*expr=*/nullptr}; + } + case OpenACCAtomicKind::Write: { + // Write supports only the format 'x = expr', where the expression is scalar + // type, and 'x' is a scalar l value. As above, this can come in 2 forms; + // Binary Operator or CXXOperatorCallExpr. + std::pair BinaryArgs = + getBinaryOpArgs(AssocStmt); + // We want the L-value for ONLY the X side, so we ignore implicit casts. For + // the right side (the expr), we emit it as an r-value so we need to + // maintain implicit casts. 
+ return {/*v=*/nullptr, BinaryArgs.first->IgnoreImpCasts(), + BinaryArgs.second}; + } + case OpenACCAtomicKind::None: + case OpenACCAtomicKind::Update: { + std::pair UnaryArgs = getUnaryOpArgs(AssocStmt); + if (UnaryArgs.first) + return {/*v=*/nullptr, UnaryArgs.second->IgnoreImpCasts(), + /*expr=*/nullptr}; + + std::pair BinaryArgs = + getBinaryOpArgs(AssocStmt); + // For binary args, we just store the RHS as an expression (in the + // expression slot), since the codegen just wants the whole thing for a + // recipe. + return {/*v=*/nullptr, BinaryArgs.first->IgnoreImpCasts(), + BinaryArgs.second}; + } + } + + llvm_unreachable("unknown OpenACC atomic kind"); +} + OpenACCCacheConstruct *OpenACCCacheConstruct::CreateEmpty(const ASTContext &C, unsigned NumVars) { void *Mem = diff --git a/clang/lib/AST/TypePrinter.cpp b/clang/lib/AST/TypePrinter.cpp index 2da7789fe8117..c18b2eafc722c 100644 --- a/clang/lib/AST/TypePrinter.cpp +++ b/clang/lib/AST/TypePrinter.cpp @@ -2147,9 +2147,6 @@ void TypePrinter::printAttributedAfter(const AttributedType *T, } case attr::AArch64VectorPcs: OS << "aarch64_vector_pcs"; break; case attr::AArch64SVEPcs: OS << "aarch64_sve_pcs"; break; - case attr::DeviceKernel: - OS << T->getAttr()->getSpelling(); - break; case attr::IntelOclBicc: OS << "inteloclbicc"; break; diff --git a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp index c88a4702cae07..90551c22e0734 100644 --- a/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp +++ b/clang/lib/Analysis/FlowSensitive/Models/UncheckedStatusOrAccessModel.cpp @@ -24,6 +24,7 @@ #include "clang/Analysis/FlowSensitive/DataflowAnalysis.h" #include "clang/Analysis/FlowSensitive/DataflowEnvironment.h" #include "clang/Analysis/FlowSensitive/MatchSwitch.h" +#include "clang/Analysis/FlowSensitive/RecordOps.h" #include "clang/Analysis/FlowSensitive/StorageLocation.h" #include 
"clang/Analysis/FlowSensitive/Value.h" #include "clang/Basic/LLVM.h" @@ -95,6 +96,18 @@ static QualType getStatusOrValueType(ClassTemplateSpecializationDecl *TRD) { return TRD->getTemplateArgs().get(0).getAsType(); } +static auto ofClassStatus() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return ofClass(hasName("::absl::Status")); +} + +static auto isStatusMemberCallWithName(llvm::StringRef member_name) { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxMemberCallExpr( + on(expr(unless(cxxThisExpr()))), + callee(cxxMethodDecl(hasName(member_name), ofClassStatus()))); +} + static auto isStatusOrMemberCallWithName(llvm::StringRef member_name) { using namespace ::clang::ast_matchers; // NOLINT: Too many names return cxxMemberCallExpr( @@ -124,6 +137,46 @@ static auto valueOperatorCall() { isStatusOrOperatorCallWithName("->"))); } +static clang::ast_matchers::TypeMatcher statusType() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return hasCanonicalType(qualType(hasDeclaration(statusClass()))); +} + +static auto isComparisonOperatorCall(llvm::StringRef operator_name) { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return cxxOperatorCallExpr( + hasOverloadedOperatorName(operator_name), argumentCountIs(2), + hasArgument(0, anyOf(hasType(statusType()), hasType(statusOrType()))), + hasArgument(1, anyOf(hasType(statusType()), hasType(statusOrType())))); +} + +static auto isOkStatusCall() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return callExpr(callee(functionDecl(hasName("::absl::OkStatus")))); +} + +static auto isNotOkStatusCall() { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return callExpr(callee(functionDecl(hasAnyName( + "::absl::AbortedError", "::absl::AlreadyExistsError", + "::absl::CancelledError", "::absl::DataLossError", + "::absl::DeadlineExceededError", "::absl::FailedPreconditionError", + 
"::absl::InternalError", "::absl::InvalidArgumentError", + "::absl::NotFoundError", "::absl::OutOfRangeError", + "::absl::PermissionDeniedError", "::absl::ResourceExhaustedError", + "::absl::UnauthenticatedError", "::absl::UnavailableError", + "::absl::UnimplementedError", "::absl::UnknownError")))); +} + +static auto isPointerComparisonOperatorCall(std::string operator_name) { + using namespace ::clang::ast_matchers; // NOLINT: Too many names + return binaryOperator(hasOperatorName(operator_name), + hasLHS(hasType(hasCanonicalType(pointerType( + pointee(anyOf(statusOrType(), statusType())))))), + hasRHS(hasType(hasCanonicalType(pointerType( + pointee(anyOf(statusOrType(), statusType()))))))); +} + static auto buildDiagnoseMatchSwitch(const UncheckedStatusOrAccessModelOptions &Options) { return CFGMatchSwitchBuilderisPRValue()) + copyRecord(StatusLoc, State.Env.getResultObjectLocation(*Expr), State.Env); + else + State.Env.setStorageLocation(*Expr, StatusLoc); +} + +static void transferStatusOkCall(const CXXMemberCallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + RecordStorageLocation *StatusLoc = + getImplicitObjectLocation(*Expr, State.Env); + if (StatusLoc == nullptr) + return; + + if (Value *Val = State.Env.getValue(locForOk(*StatusLoc))) + State.Env.setValue(*Expr, *Val); +} + +static void transferStatusUpdateCall(const CXXMemberCallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + // S.Update(OtherS) sets S to the error code of OtherS if it is OK, + // otherwise does nothing. + assert(Expr->getNumArgs() == 1); + auto *Arg = Expr->getArg(0); + RecordStorageLocation *ArgRecord = + Arg->isPRValue() ? 
&State.Env.getResultObjectLocation(*Arg) + : State.Env.get(*Arg); + RecordStorageLocation *ThisLoc = getImplicitObjectLocation(*Expr, State.Env); + if (ThisLoc == nullptr || ArgRecord == nullptr) + return; + + auto &ThisOkVal = valForOk(*ThisLoc, State.Env); + auto &ArgOkVal = valForOk(*ArgRecord, State.Env); + auto &A = State.Env.arena(); + auto &NewVal = State.Env.makeAtomicBoolValue(); + State.Env.assume(A.makeImplies(A.makeNot(ThisOkVal.formula()), + A.makeNot(NewVal.formula()))); + State.Env.assume(A.makeImplies(NewVal.formula(), ArgOkVal.formula())); + State.Env.setValue(locForOk(*ThisLoc), NewVal); +} + +static BoolValue *evaluateStatusEquality(RecordStorageLocation &LhsStatusLoc, + RecordStorageLocation &RhsStatusLoc, + Environment &Env) { + auto &A = Env.arena(); + // Logically, a Status object is composed of an error code that could take one + // of multiple possible values, including the "ok" value. We track whether a + // Status object has an "ok" value and represent this as an `ok` bit. Equality + // of Status objects compares their error codes. Therefore, merely comparing + // the `ok` bits isn't sufficient: when two Status objects are assigned non-ok + // error codes the equality of their respective error codes matters. Since we + // only track the `ok` bits, we can't make any conclusions about equality when + // we know that two Status objects have non-ok values. + + auto &LhsOkVal = valForOk(LhsStatusLoc, Env); + auto &RhsOkVal = valForOk(RhsStatusLoc, Env); + + auto &Res = Env.makeAtomicBoolValue(); + + // lhs && rhs => res (a.k.a. 
!res => !lhs || !rhs) + Env.assume(A.makeImplies(A.makeAnd(LhsOkVal.formula(), RhsOkVal.formula()), + Res.formula())); + // res => (lhs == rhs) + Env.assume(A.makeImplies( + Res.formula(), A.makeEquals(LhsOkVal.formula(), RhsOkVal.formula()))); + + return &Res; +} + +static BoolValue * +evaluateStatusOrEquality(RecordStorageLocation &LhsStatusOrLoc, + RecordStorageLocation &RhsStatusOrLoc, + Environment &Env) { + auto &A = Env.arena(); + // Logically, a StatusOr object is composed of two values - a Status and a + // value of type T. Equality of StatusOr objects compares both values. + // Therefore, merely comparing the `ok` bits of the Status values isn't + // sufficient. When two StatusOr objects are engaged, the equality of their + // respective values of type T matters. Similarly, when two StatusOr objects + // have Status values that have non-ok error codes, the equality of the error + // codes matters. Since we only track the `ok` bits of the Status values, we + // can't make any conclusions about equality when we know that two StatusOr + // objects are engaged or when their Status values contain non-ok error codes. + auto &LhsOkVal = valForOk(locForStatus(LhsStatusOrLoc), Env); + auto &RhsOkVal = valForOk(locForStatus(RhsStatusOrLoc), Env); + auto &res = Env.makeAtomicBoolValue(); + + // res => (lhs == rhs) + Env.assume(A.makeImplies( + res.formula(), A.makeEquals(LhsOkVal.formula(), RhsOkVal.formula()))); + return &res; +} + +static BoolValue *evaluateEquality(const Expr *LhsExpr, const Expr *RhsExpr, + Environment &Env) { + // Check the type of both sides in case an operator== is added that admits + // different types. 
+ if (isStatusOrType(LhsExpr->getType()) && + isStatusOrType(RhsExpr->getType())) { + auto *LhsStatusOrLoc = Env.get(*LhsExpr); + if (LhsStatusOrLoc == nullptr) + return nullptr; + auto *RhsStatusOrLoc = Env.get(*RhsExpr); + if (RhsStatusOrLoc == nullptr) + return nullptr; + + return evaluateStatusOrEquality(*LhsStatusOrLoc, *RhsStatusOrLoc, Env); + } + if (isStatusType(LhsExpr->getType()) && isStatusType(RhsExpr->getType())) { + auto *LhsStatusLoc = Env.get(*LhsExpr); + if (LhsStatusLoc == nullptr) + return nullptr; + + auto *RhsStatusLoc = Env.get(*RhsExpr); + if (RhsStatusLoc == nullptr) + return nullptr; + + return evaluateStatusEquality(*LhsStatusLoc, *RhsStatusLoc, Env); + } + return nullptr; +} + +static void transferComparisonOperator(const CXXOperatorCallExpr *Expr, + LatticeTransferState &State, + bool IsNegative) { + auto *LhsAndRhsVal = + evaluateEquality(Expr->getArg(0), Expr->getArg(1), State.Env); + if (LhsAndRhsVal == nullptr) + return; + + if (IsNegative) + State.Env.setValue(*Expr, State.Env.makeNot(*LhsAndRhsVal)); + else + State.Env.setValue(*Expr, *LhsAndRhsVal); +} + +static RecordStorageLocation *getPointeeLocation(const Expr &Expr, + Environment &Env) { + if (auto *PointerVal = Env.get(Expr)) + return dyn_cast(&PointerVal->getPointeeLoc()); + return nullptr; +} + +static BoolValue *evaluatePointerEquality(const Expr *LhsExpr, + const Expr *RhsExpr, + Environment &Env) { + assert(LhsExpr->getType()->isPointerType()); + assert(RhsExpr->getType()->isPointerType()); + RecordStorageLocation *LhsStatusLoc = nullptr; + RecordStorageLocation *RhsStatusLoc = nullptr; + if (isStatusOrType(LhsExpr->getType()->getPointeeType()) && + isStatusOrType(RhsExpr->getType()->getPointeeType())) { + auto *LhsStatusOrLoc = getPointeeLocation(*LhsExpr, Env); + auto *RhsStatusOrLoc = getPointeeLocation(*RhsExpr, Env); + if (LhsStatusOrLoc == nullptr || RhsStatusOrLoc == nullptr) + return nullptr; + LhsStatusLoc = &locForStatus(*LhsStatusOrLoc); + RhsStatusLoc = 
&locForStatus(*RhsStatusOrLoc); + } else if (isStatusType(LhsExpr->getType()->getPointeeType()) && + isStatusType(RhsExpr->getType()->getPointeeType())) { + LhsStatusLoc = getPointeeLocation(*LhsExpr, Env); + RhsStatusLoc = getPointeeLocation(*RhsExpr, Env); + } + if (LhsStatusLoc == nullptr || RhsStatusLoc == nullptr) + return nullptr; + auto &LhsOkVal = valForOk(*LhsStatusLoc, Env); + auto &RhsOkVal = valForOk(*RhsStatusLoc, Env); + auto &Res = Env.makeAtomicBoolValue(); + auto &A = Env.arena(); + Env.assume(A.makeImplies( + Res.formula(), A.makeEquals(LhsOkVal.formula(), RhsOkVal.formula()))); + return &Res; +} + +static void transferPointerComparisonOperator(const BinaryOperator *Expr, + LatticeTransferState &State, + bool IsNegative) { + auto *LhsAndRhsVal = + evaluatePointerEquality(Expr->getLHS(), Expr->getRHS(), State.Env); + if (LhsAndRhsVal == nullptr) + return; + + if (IsNegative) + State.Env.setValue(*Expr, State.Env.makeNot(*LhsAndRhsVal)); + else + State.Env.setValue(*Expr, *LhsAndRhsVal); +} + +static void transferOkStatusCall(const CallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + auto &OkVal = + initializeStatus(State.Env.getResultObjectLocation(*Expr), State.Env); + State.Env.assume(OkVal.formula()); +} + +static void transferNotOkStatusCall(const CallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + auto &OkVal = + initializeStatus(State.Env.getResultObjectLocation(*Expr), State.Env); + auto &A = State.Env.arena(); + State.Env.assume(A.makeNot(OkVal.formula())); +} + +static void transferEmplaceCall(const CXXMemberCallExpr *Expr, + const MatchFinder::MatchResult &, + LatticeTransferState &State) { + RecordStorageLocation *StatusOrLoc = + getImplicitObjectLocation(*Expr, State.Env); + if (StatusOrLoc == nullptr) + return; + + auto &OkVal = valForOk(locForStatus(*StatusOrLoc), State.Env); + State.Env.assume(OkVal.formula()); +} + CFGMatchSwitch buildTransferMatchSwitch(ASTContext 
&Ctx, CFGMatchSwitchBuilder Builder) { @@ -251,6 +535,44 @@ buildTransferMatchSwitch(ASTContext &Ctx, return std::move(Builder) .CaseOfCFGStmt(isStatusOrMemberCallWithName("ok"), transferStatusOrOkCall) + .CaseOfCFGStmt(isStatusOrMemberCallWithName("status"), + transferStatusCall) + .CaseOfCFGStmt(isStatusMemberCallWithName("ok"), + transferStatusOkCall) + .CaseOfCFGStmt(isStatusMemberCallWithName("Update"), + transferStatusUpdateCall) + .CaseOfCFGStmt( + isComparisonOperatorCall("=="), + [](const CXXOperatorCallExpr *Expr, const MatchFinder::MatchResult &, + LatticeTransferState &State) { + transferComparisonOperator(Expr, State, + /*IsNegative=*/false); + }) + .CaseOfCFGStmt( + isComparisonOperatorCall("!="), + [](const CXXOperatorCallExpr *Expr, const MatchFinder::MatchResult &, + LatticeTransferState &State) { + transferComparisonOperator(Expr, State, + /*IsNegative=*/true); + }) + .CaseOfCFGStmt( + isPointerComparisonOperatorCall("=="), + [](const BinaryOperator *Expr, const MatchFinder::MatchResult &, + LatticeTransferState &State) { + transferPointerComparisonOperator(Expr, State, + /*IsNegative=*/false); + }) + .CaseOfCFGStmt( + isPointerComparisonOperatorCall("!="), + [](const BinaryOperator *Expr, const MatchFinder::MatchResult &, + LatticeTransferState &State) { + transferPointerComparisonOperator(Expr, State, + /*IsNegative=*/true); + }) + .CaseOfCFGStmt(isOkStatusCall(), transferOkStatusCall) + .CaseOfCFGStmt(isNotOkStatusCall(), transferNotOkStatusCall) + .CaseOfCFGStmt(isStatusOrMemberCallWithName("emplace"), + transferEmplaceCall) .Build(); } diff --git a/clang/lib/Basic/FileManager.cpp b/clang/lib/Basic/FileManager.cpp index 7481e1ee9f7ea..e744cc0afcded 100644 --- a/clang/lib/Basic/FileManager.cpp +++ b/clang/lib/Basic/FileManager.cpp @@ -474,8 +474,9 @@ OptionalFileEntryRef FileManager::getBypassFile(FileEntryRef VF) { return FileEntryRef(*Insertion.first); } -bool FileManager::FixupRelativePath(SmallVectorImpl &path) const { - StringRef 
pathRef(path.data(), path.size()); +bool FileManager::fixupRelativePath(const FileSystemOptions &FileSystemOpts, + SmallVectorImpl &Path) { + StringRef pathRef(Path.data(), Path.size()); if (FileSystemOpts.WorkingDir.empty() || llvm::sys::path::is_absolute(pathRef)) @@ -483,7 +484,7 @@ bool FileManager::FixupRelativePath(SmallVectorImpl &path) const { SmallString<128> NewPath(FileSystemOpts.WorkingDir); llvm::sys::path::append(NewPath, pathRef); - path = NewPath; + Path = NewPath; return true; } diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp index 18641a96063cd..a97e93470987c 100644 --- a/clang/lib/Basic/Targets/AArch64.cpp +++ b/clang/lib/Basic/Targets/AArch64.cpp @@ -206,8 +206,7 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple, StringRef AArch64TargetInfo::getABI() const { return ABI; } bool AArch64TargetInfo::setABI(const std::string &Name) { - if (Name != "aapcs" && Name != "aapcs-soft" && Name != "darwinpcs" && - Name != "pauthtest") + if (Name != "aapcs" && Name != "aapcs-soft" && Name != "darwinpcs") return false; ABI = Name; @@ -221,12 +220,6 @@ bool AArch64TargetInfo::validateTarget(DiagnosticsEngine &Diags) const { Diags.Report(diag::err_target_unsupported_abi_with_fpu) << ABI; return false; } - if (getTriple().getEnvironment() == llvm::Triple::PAuthTest && - getTriple().getOS() != llvm::Triple::Linux) { - Diags.Report(diag::err_target_unsupported_abi_for_triple) - << getTriple().getEnvironmentName() << getTriple().getTriple(); - return false; - } return true; } @@ -398,6 +391,12 @@ void AArch64TargetInfo::getTargetDefinesARMV96A(const LangOptions &Opts, getTargetDefinesARMV95A(Opts, Builder); } +void AArch64TargetInfo::getTargetDefinesARMV97A(const LangOptions &Opts, + MacroBuilder &Builder) const { + // Armv9.7-A does not have a v8.* equivalent, but is a superset of v9.6-A. 
+ getTargetDefinesARMV96A(Opts, Builder); +} + void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const { // Target identification. @@ -714,6 +713,8 @@ void AArch64TargetInfo::getTargetDefines(const LangOptions &Opts, getTargetDefinesARMV95A(Opts, Builder); else if (*ArchInfo == llvm::AArch64::ARMV9_6A) getTargetDefinesARMV96A(Opts, Builder); + else if (*ArchInfo == llvm::AArch64::ARMV9_7A) + getTargetDefinesARMV97A(Opts, Builder); // All of the __sync_(bool|val)_compare_and_swap_(1|2|4|8|16) builtins work. Builder.defineMacro("__GCC_HAVE_SYNC_COMPARE_AND_SWAP_1"); @@ -810,10 +811,10 @@ bool AArch64TargetInfo::validateCpuSupports(StringRef FeatureStr) const { bool AArch64TargetInfo::hasFeature(StringRef Feature) const { return llvm::StringSwitch(Feature) - .Cases("aarch64", "arm64", "arm", true) + .Cases({"aarch64", "arm64", "arm"}, true) .Case("fmv", HasFMV) .Case("fp", FPU & FPUMode) - .Cases("neon", "simd", FPU & NeonMode) + .Cases({"neon", "simd"}, FPU & NeonMode) .Case("jscvt", HasJSCVT) .Case("fcma", HasFCMA) .Case("rng", HasRandGen) @@ -828,8 +829,8 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("cssc", HasCSSC) .Case("sha2", HasSHA2) .Case("sha3", HasSHA3) - .Cases("aes", "pmull", HasAES) - .Cases("fp16", "fullfp16", HasFullFP16) + .Cases({"aes", "pmull"}, HasAES) + .Cases({"fp16", "fullfp16"}, HasFullFP16) .Case("dit", HasDIT) .Case("dpb", HasCCPP) .Case("dpb2", HasCCDP) @@ -858,9 +859,9 @@ bool AArch64TargetInfo::hasFeature(StringRef Feature) const { .Case("memtag", HasMTE) .Case("sb", HasSB) .Case("predres", HasPredRes) - .Cases("ssbs", "ssbs2", HasSSBS) + .Cases({"ssbs", "ssbs2"}, HasSSBS) .Case("bti", HasBTI) - .Cases("ls64", "ls64_v", "ls64_accdata", HasLS64) + .Cases({"ls64", "ls64_v", "ls64_accdata"}, HasLS64) .Case("wfxt", HasWFxT) .Case("rcpc3", HasRCPC3) .Case("fp8", HasFP8) @@ -1152,6 +1153,9 @@ bool AArch64TargetInfo::handleTargetFeatures(std::vector &Features, if (Feature == 
"+v9.6a" && ArchInfo->Version < llvm::AArch64::ARMV9_6A.Version) ArchInfo = &llvm::AArch64::ARMV9_6A; + if (Feature == "+v9.7a" && + ArchInfo->Version < llvm::AArch64::ARMV9_7A.Version) + ArchInfo = &llvm::AArch64::ARMV9_7A; if (Feature == "+v8r") ArchInfo = &llvm::AArch64::ARMV8R; if (Feature == "+fullfp16") { diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h index dfd89beeee2f3..7d0737b2e8df0 100644 --- a/clang/lib/Basic/Targets/AArch64.h +++ b/clang/lib/Basic/Targets/AArch64.h @@ -135,6 +135,7 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { const llvm::AArch64::ArchInfo *ArchInfo = &llvm::AArch64::ARMV8A; +protected: std::string ABI; public: @@ -190,6 +191,8 @@ class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo { MacroBuilder &Builder) const; void getTargetDefinesARMV96A(const LangOptions &Opts, MacroBuilder &Builder) const; + void getTargetDefinesARMV97A(const LangOptions &Opts, + MacroBuilder &Builder) const; void getTargetDefines(const LangOptions &Opts, MacroBuilder &Builder) const override; @@ -277,6 +280,16 @@ class LLVM_LIBRARY_VISIBILITY AArch64leTargetInfo : public AArch64TargetInfo { void setDataLayout() override; }; +template <> +inline bool +LinuxTargetInfo::setABI(const std::string &Name) { + if (Name == "pauthtest") { + ABI = Name; + return true; + } + return AArch64leTargetInfo::setABI(Name); +} + class LLVM_LIBRARY_VISIBILITY WindowsARM64TargetInfo : public WindowsTargetInfo { const llvm::Triple Triple; diff --git a/clang/lib/Basic/Targets/ARM.cpp b/clang/lib/Basic/Targets/ARM.cpp index d00a3a453ed52..394b50b9ee222 100644 --- a/clang/lib/Basic/Targets/ARM.cpp +++ b/clang/lib/Basic/Targets/ARM.cpp @@ -231,6 +231,8 @@ StringRef ARMTargetInfo::getCPUAttr() const { return "9_5A"; case llvm::ARM::ArchKind::ARMV9_6A: return "9_6A"; + case llvm::ARM::ArchKind::ARMV9_7A: + return "9_7A"; case llvm::ARM::ArchKind::ARMV8MBaseline: return "8M_BASE"; case 
llvm::ARM::ArchKind::ARMV8MMainline: @@ -904,6 +906,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, case llvm::ARM::ArchKind::ARMV9_4A: case llvm::ARM::ArchKind::ARMV9_5A: case llvm::ARM::ArchKind::ARMV9_6A: + case llvm::ARM::ArchKind::ARMV9_7A: // Filter __arm_cdp, __arm_ldcl, __arm_stcl in arm_acle.h FeatureCoprocBF = FEATURE_COPROC_B1 | FEATURE_COPROC_B3; break; @@ -1074,6 +1077,7 @@ void ARMTargetInfo::getTargetDefines(const LangOptions &Opts, case llvm::ARM::ArchKind::ARMV9_4A: case llvm::ARM::ArchKind::ARMV9_5A: case llvm::ARM::ArchKind::ARMV9_6A: + case llvm::ARM::ArchKind::ARMV9_7A: getTargetDefinesARMV83A(Opts, Builder); break; } diff --git a/clang/lib/Basic/Targets/Hexagon.cpp b/clang/lib/Basic/Targets/Hexagon.cpp index cea64f9860036..d5b413cb58eb2 100644 --- a/clang/lib/Basic/Targets/Hexagon.cpp +++ b/clang/lib/Basic/Targets/Hexagon.cpp @@ -83,6 +83,9 @@ void HexagonTargetInfo::getTargetDefines(const LangOptions &Opts, } else if (CPU == "hexagonv79") { Builder.defineMacro("__HEXAGON_V79__"); Builder.defineMacro("__HEXAGON_ARCH__", "79"); + } else if (CPU == "hexagonv81") { + Builder.defineMacro("__HEXAGON_V81__"); + Builder.defineMacro("__HEXAGON_ARCH__", "81"); } if (hasFeature("hvx-length64b")) { @@ -252,8 +255,7 @@ static constexpr CPUSuffix Suffixes[] = { {{"hexagonv68"}, {"68"}}, {{"hexagonv69"}, {"69"}}, {{"hexagonv71"}, {"71"}}, {{"hexagonv71t"}, {"71t"}}, {{"hexagonv73"}, {"73"}}, {{"hexagonv75"}, {"75"}}, - {{"hexagonv79"}, {"79"}}, -}; + {{"hexagonv79"}, {"79"}}, {{"hexagonv81"}, {"81"}}}; std::optional HexagonTargetInfo::getHexagonCPURev(StringRef Name) { StringRef Arch = Name; diff --git a/clang/lib/Basic/Targets/NVPTX.h b/clang/lib/Basic/Targets/NVPTX.h index 33c29586359be..f5c8396f398aa 100644 --- a/clang/lib/Basic/Targets/NVPTX.h +++ b/clang/lib/Basic/Targets/NVPTX.h @@ -200,7 +200,7 @@ class LLVM_LIBRARY_VISIBILITY NVPTXTargetInfo : public TargetInfo { // a host function. 
if (HostTarget) return HostTarget->checkCallingConvention(CC); - return CCCR_Warning; + return CC == CC_DeviceKernel ? CCCR_OK : CCCR_Warning; } bool hasBitIntType() const override { return true; } diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp index e744e84a5b079..e99bbd159929c 100644 --- a/clang/lib/Basic/Targets/OSTargets.cpp +++ b/clang/lib/Basic/Targets/OSTargets.cpp @@ -10,6 +10,7 @@ //===----------------------------------------------------------------------===// #include "OSTargets.h" +#include "AArch64.h" #include "clang/Basic/MacroBuilder.h" #include "llvm/ADT/StringRef.h" diff --git a/clang/lib/Basic/Targets/OSTargets.h b/clang/lib/Basic/Targets/OSTargets.h index bd6ffcf3173ab..4d81c9a83714d 100644 --- a/clang/lib/Basic/Targets/OSTargets.h +++ b/clang/lib/Basic/Targets/OSTargets.h @@ -408,6 +408,12 @@ class LLVM_LIBRARY_VISIBILITY LinuxTargetInfo : public OSTargetInfo { const char *getStaticInitSectionSpecifier() const override { return ".text.startup"; } + + // This allows template specializations, see + // LinuxTargetInfo::setABI + bool setABI(const std::string &Name) override { + return OSTargetInfo::setABI(Name); + } }; // Managarm Target diff --git a/clang/lib/Basic/Targets/PPC.h b/clang/lib/Basic/Targets/PPC.h index 9f3a4cd2da716..846b240218172 100644 --- a/clang/lib/Basic/Targets/PPC.h +++ b/clang/lib/Basic/Targets/PPC.h @@ -122,41 +122,41 @@ class LLVM_LIBRARY_VISIBILITY PPCTargetInfo : public TargetInfo { .Case("970", ArchDefineName | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Case("a2", ArchDefineA2) - .Cases("power3", "pwr3", ArchDefinePpcgr) - .Cases("power4", "pwr4", + .Cases({"power3", "pwr3"}, ArchDefinePpcgr) + .Cases({"power4", "pwr4"}, ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) .Cases("power5", "pwr5", ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power5x", "pwr5x", + .Cases({"power5x", "pwr5x"}, ArchDefinePwr5x | ArchDefinePwr5 | 
ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power6", "pwr6", - ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | - ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power6x", "pwr6x", + .Cases({"power6", "pwr6"}, ArchDefinePwr6 | ArchDefinePwr5x | + ArchDefinePwr5 | ArchDefinePwr4 | + ArchDefinePpcgr | ArchDefinePpcsq) + .Cases({"power6x", "pwr6x"}, ArchDefinePwr6x | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power7", "pwr7", - ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | - ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | - ArchDefinePpcsq) + .Cases({"power7", "pwr7"}, ArchDefinePwr7 | ArchDefinePwr6 | + ArchDefinePwr5x | ArchDefinePwr5 | + ArchDefinePwr4 | ArchDefinePpcgr | + ArchDefinePpcsq) // powerpc64le automatically defaults to at least power8. - .Cases("power8", "pwr8", "ppc64le", + .Cases({"power8", "pwr8", "ppc64le"}, ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power9", "pwr9", + .Cases({"power9", "pwr9"}, ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power10", "pwr10", + .Cases({"power10", "pwr10"}, ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | ArchDefinePpcgr | ArchDefinePpcsq) - .Cases("power11", "pwr11", + .Cases({"power11", "pwr11"}, ArchDefinePwr11 | ArchDefinePwr10 | ArchDefinePwr9 | ArchDefinePwr8 | ArchDefinePwr7 | ArchDefinePwr6 | ArchDefinePwr5x | ArchDefinePwr5 | ArchDefinePwr4 | diff --git a/clang/lib/CIR/CodeGen/CIRGenAsm.cpp b/clang/lib/CIR/CodeGen/CIRGenAsm.cpp index 17dffb3515d2a..88a7e85cb2a64 100644 --- a/clang/lib/CIR/CodeGen/CIRGenAsm.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenAsm.cpp @@ -117,9 +117,9 @@ 
mlir::LogicalResult CIRGenFunction::emitAsmStmt(const AsmStmt &s) { bool hasSideEffect = s.isVolatile() || s.getNumOutputs() == 0; - cir::InlineAsmOp ia = builder.create( - getLoc(s.getAsmLoc()), resultType, operands, asmString, constraints, - hasSideEffect, inferFlavor(cgm, s), mlir::ArrayAttr()); + cir::InlineAsmOp ia = cir::InlineAsmOp::create( + builder, getLoc(s.getAsmLoc()), resultType, operands, asmString, + constraints, hasSideEffect, inferFlavor(cgm, s), mlir::ArrayAttr()); if (isGCCAsmGoto) { assert(!cir::MissingFeatures::asmGoto()); diff --git a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp index 670a431e47f40..75355ee33b29d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuilder.cpp @@ -22,8 +22,8 @@ mlir::Value CIRGenBuilderTy::maybeBuildArrayDecay(mlir::Location loc, if (arrayTy) { const cir::PointerType flatPtrTy = getPointerTo(arrayTy.getElementType()); - return create(loc, flatPtrTy, cir::CastKind::array_to_ptrdecay, - arrayPtr); + return cir::CastOp::create(*this, loc, flatPtrTy, + cir::CastKind::array_to_ptrdecay, arrayPtr); } assert(arrayPtrTy.getPointee() == eltTy && @@ -40,7 +40,7 @@ mlir::Value CIRGenBuilderTy::getArrayElement(mlir::Location arrayLocBegin, if (shouldDecay) basePtr = maybeBuildArrayDecay(arrayLocBegin, arrayPtr, eltTy); const mlir::Type flatPtrTy = basePtr.getType(); - return create(arrayLocEnd, flatPtrTy, basePtr, idx); + return cir::PtrStrideOp::create(*this, arrayLocEnd, flatPtrTy, basePtr, idx); } cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, @@ -60,14 +60,14 @@ cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, cir::ConstantOp CIRGenBuilderTy::getConstInt(mlir::Location loc, mlir::Type t, uint64_t c) { assert(mlir::isa(t) && "expected cir::IntType"); - return create(loc, cir::IntAttr::get(t, c)); + return cir::ConstantOp::create(*this, loc, cir::IntAttr::get(t, c)); } cir::ConstantOp 
clang::CIRGen::CIRGenBuilderTy::getConstFP(mlir::Location loc, mlir::Type t, llvm::APFloat fpVal) { assert(mlir::isa(t) && "expected floating point type"); - return create(loc, cir::FPAttr::get(t, fpVal)); + return cir::ConstantOp::create(*this, loc, cir::FPAttr::get(t, fpVal)); } void CIRGenBuilderTy::computeGlobalViewIndicesFromFlatOffset( diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp index ea31871806bd7..e35100ffe4b6b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltin.cpp @@ -46,9 +46,9 @@ static RValue emitBuiltinBitOp(CIRGenFunction &cgf, const CallExpr *e, Op op; if constexpr (std::is_same_v || std::is_same_v) - op = builder.create(cgf.getLoc(e->getSourceRange()), arg, poisonZero); + op = Op::create(builder, cgf.getLoc(e->getSourceRange()), arg, poisonZero); else - op = builder.create(cgf.getLoc(e->getSourceRange()), arg); + op = Op::create(builder, cgf.getLoc(e->getSourceRange()), arg); mlir::Value result = op.getResult(); mlir::Type exprTy = cgf.convertType(e->getType()); @@ -67,8 +67,8 @@ RValue CIRGenFunction::emitRotate(const CallExpr *e, bool isRotateLeft) { // to the type of input when necessary. 
assert(!cir::MissingFeatures::msvcBuiltins()); - auto r = builder.create(getLoc(e->getSourceRange()), input, - amount, isRotateLeft); + auto r = cir::RotateOp::create(builder, getLoc(e->getSourceRange()), input, + amount, isRotateLeft); return RValue::get(r); } @@ -227,14 +227,14 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return RValue::get(nullptr); mlir::Value argValue = emitCheckedArgForAssume(e->getArg(0)); - builder.create(loc, argValue); + cir::AssumeOp::create(builder, loc, argValue); return RValue::get(nullptr); } case Builtin::BI__builtin_assume_separate_storage: { mlir::Value value0 = emitScalarExpr(e->getArg(0)); mlir::Value value1 = emitScalarExpr(e->getArg(1)); - builder.create(loc, value0, value1); + cir::AssumeSepStorageOp::create(builder, loc, value0, value1); return RValue::get(nullptr); } @@ -363,8 +363,8 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, probability); } - auto result = builder.create( - loc, argValue.getType(), argValue, expectedValue, probAttr); + auto result = cir::ExpectOp::create(builder, loc, argValue.getType(), + argValue, expectedValue, probAttr); return RValue::get(result); } @@ -375,7 +375,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, case Builtin::BI_byteswap_ulong: case Builtin::BI_byteswap_uint64: { mlir::Value arg = emitScalarExpr(e->getArg(0)); - return RValue::get(builder.create(loc, arg)); + return RValue::get(cir::ByteSwapOp::create(builder, loc, arg)); } case Builtin::BI__builtin_bitreverse8: @@ -383,7 +383,7 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, case Builtin::BI__builtin_bitreverse32: case Builtin::BI__builtin_bitreverse64: { mlir::Value arg = emitScalarExpr(e->getArg(0)); - return RValue::get(builder.create(loc, arg)); + return RValue::get(cir::BitReverseOp::create(builder, loc, arg)); } case Builtin::BI__builtin_rotateleft8: @@ -449,10 +449,36 @@ 
RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, } case Builtin::BI__builtin_coro_free: case Builtin::BI__builtin_coro_size: { - cgm.errorNYI(e->getSourceRange(), - "BI__builtin_coro_free, BI__builtin_coro_size NYI"); - assert(!cir::MissingFeatures::coroSizeBuiltinCall()); - return getUndefRValue(e->getType()); + GlobalDecl gd{fd}; + mlir::Type ty = cgm.getTypes().getFunctionType( + cgm.getTypes().arrangeGlobalDeclaration(gd)); + const auto *nd = cast(gd.getDecl()); + cir::FuncOp fnOp = + cgm.getOrCreateCIRFunction(nd->getName(), ty, gd, /*ForVTable=*/false); + fnOp.setBuiltin(true); + return emitCall(e->getCallee()->getType(), CIRGenCallee::forDirect(fnOp), e, + returnValue); + } + case Builtin::BI__builtin_prefetch: { + auto evaluateOperandAsInt = [&](const Expr *arg) { + Expr::EvalResult res; + [[maybe_unused]] bool evalSucceed = + arg->EvaluateAsInt(res, cgm.getASTContext()); + assert(evalSucceed && "expression should be able to evaluate as int"); + return res.Val.getInt().getZExtValue(); + }; + + bool isWrite = false; + if (e->getNumArgs() > 1) + isWrite = evaluateOperandAsInt(e->getArg(1)); + + int locality = 3; + if (e->getNumArgs() > 2) + locality = evaluateOperandAsInt(e->getArg(2)); + + mlir::Value address = emitScalarExpr(e->getArg(0)); + cir::PrefetchOp::create(builder, loc, address, locality, isWrite); + return RValue::get(nullptr); } } @@ -463,12 +489,107 @@ RValue CIRGenFunction::emitBuiltinExpr(const GlobalDecl &gd, unsigned builtinID, return emitLibraryCall(*this, fd, e, cgm.getBuiltinLibFunction(fd, builtinID)); + // Some target-specific builtins can have aggregate return values, e.g. + // __builtin_arm_mve_vld2q_u32. So if the result is an aggregate, force + // returnValue to be non-null, so that the target-specific emission code can + // always just emit into it. 
+ cir::TypeEvaluationKind evalKind = getEvaluationKind(e->getType()); + if (evalKind == cir::TEK_Aggregate && returnValue.isNull()) { + cgm.errorNYI(e->getSourceRange(), "aggregate return value from builtin"); + return getUndefRValue(e->getType()); + } + + // Now see if we can emit a target-specific builtin. + if (mlir::Value v = emitTargetBuiltinExpr(builtinID, e, returnValue)) { + switch (evalKind) { + case cir::TEK_Scalar: + if (mlir::isa(v.getType())) + return RValue::get(nullptr); + return RValue::get(v); + case cir::TEK_Aggregate: + cgm.errorNYI(e->getSourceRange(), "aggregate return value from builtin"); + return getUndefRValue(e->getType()); + case cir::TEK_Complex: + llvm_unreachable("No current target builtin returns complex"); + } + llvm_unreachable("Bad evaluation kind in EmitBuiltinExpr"); + } + cgm.errorNYI(e->getSourceRange(), std::string("unimplemented builtin call: ") + getContext().BuiltinInfo.getName(builtinID)); return getUndefRValue(e->getType()); } +static mlir::Value emitTargetArchBuiltinExpr(CIRGenFunction *cgf, + unsigned builtinID, + const CallExpr *e, + ReturnValueSlot &returnValue, + llvm::Triple::ArchType arch) { + // When compiling in HipStdPar mode we have to be conservative in rejecting + // target specific features in the FE, and defer the possible error to the + // AcceleratorCodeSelection pass, wherein iff an unsupported target builtin is + // referenced by an accelerator executable function, we emit an error. + // Returning nullptr here leads to the builtin being handled in + // EmitStdParUnsupportedBuiltin. 
+ if (cgf->getLangOpts().HIPStdPar && cgf->getLangOpts().CUDAIsDevice && + arch != cgf->getTarget().getTriple().getArch()) + return {}; + + switch (arch) { + case llvm::Triple::arm: + case llvm::Triple::armeb: + case llvm::Triple::thumb: + case llvm::Triple::thumbeb: + case llvm::Triple::aarch64: + case llvm::Triple::aarch64_32: + case llvm::Triple::aarch64_be: + case llvm::Triple::bpfeb: + case llvm::Triple::bpfel: + // These are actually NYI, but that will be reported by emitBuiltinExpr. + // At this point, we don't even know that the builtin is target-specific. + return nullptr; + + case llvm::Triple::x86: + case llvm::Triple::x86_64: + return cgf->emitX86BuiltinExpr(builtinID, e); + + case llvm::Triple::ppc: + case llvm::Triple::ppcle: + case llvm::Triple::ppc64: + case llvm::Triple::ppc64le: + case llvm::Triple::r600: + case llvm::Triple::amdgcn: + case llvm::Triple::systemz: + case llvm::Triple::nvptx: + case llvm::Triple::nvptx64: + case llvm::Triple::wasm32: + case llvm::Triple::wasm64: + case llvm::Triple::hexagon: + case llvm::Triple::riscv32: + case llvm::Triple::riscv64: + // These are actually NYI, but that will be reported by emitBuiltinExpr. + // At this point, we don't even know that the builtin is target-specific. + return {}; + default: + return {}; + } +} + +mlir::Value +CIRGenFunction::emitTargetBuiltinExpr(unsigned builtinID, const CallExpr *e, + ReturnValueSlot &returnValue) { + if (getContext().BuiltinInfo.isAuxBuiltinID(builtinID)) { + assert(getContext().getAuxTargetInfo() && "Missing aux target info"); + return emitTargetArchBuiltinExpr( + this, getContext().BuiltinInfo.getAuxBuiltinID(builtinID), e, + returnValue, getContext().getAuxTargetInfo()->getTriple().getArch()); + } + + return emitTargetArchBuiltinExpr(this, builtinID, e, returnValue, + getTarget().getTriple().getArch()); +} + /// Given a builtin id for a function like "__builtin_fabsf", return a Function* /// for "fabsf". 
cir::FuncOp CIRGenModule::getBuiltinLibFunction(const FunctionDecl *fd, diff --git a/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp new file mode 100644 index 0000000000000..3c9c7ecf35aff --- /dev/null +++ b/clang/lib/CIR/CodeGen/CIRGenBuiltinX86.cpp @@ -0,0 +1,814 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This contains code to emit x86/x86_64 Builtin calls as CIR or a function +// call to be later resolved. +// +//===----------------------------------------------------------------------===// + +#include "CIRGenFunction.h" +#include "CIRGenModule.h" +#include "clang/Basic/Builtins.h" +#include "clang/Basic/TargetBuiltins.h" +#include "clang/CIR/MissingFeatures.h" +#include "llvm/IR/IntrinsicsX86.h" + +using namespace clang; +using namespace clang::CIRGen; + +mlir::Value CIRGenFunction::emitX86BuiltinExpr(unsigned builtinID, + const CallExpr *e) { + if (builtinID == Builtin::BI__builtin_cpu_is) { + cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_is"); + return {}; + } + if (builtinID == Builtin::BI__builtin_cpu_supports) { + cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_supports"); + return {}; + } + if (builtinID == Builtin::BI__builtin_cpu_init) { + cgm.errorNYI(e->getSourceRange(), "__builtin_cpu_init"); + return {}; + } + + // Handle MSVC intrinsics before argument evaluation to prevent double + // evaluation. + assert(!cir::MissingFeatures::msvcBuiltins()); + + // Find out if any arguments are required to be integer constant expressions. 
+ assert(!cir::MissingFeatures::handleBuiltinICEArguments()); + + switch (builtinID) { + default: + return {}; + case X86::BI_mm_prefetch: + case X86::BI_mm_clflush: + case X86::BI_mm_lfence: + case X86::BI_mm_pause: + case X86::BI_mm_mfence: + case X86::BI_mm_sfence: + case X86::BI__rdtsc: + case X86::BI__builtin_ia32_rdtscp: + case X86::BI__builtin_ia32_lzcnt_u16: + case X86::BI__builtin_ia32_lzcnt_u32: + case X86::BI__builtin_ia32_lzcnt_u64: + case X86::BI__builtin_ia32_tzcnt_u16: + case X86::BI__builtin_ia32_tzcnt_u32: + case X86::BI__builtin_ia32_tzcnt_u64: + case X86::BI__builtin_ia32_undef128: + case X86::BI__builtin_ia32_undef256: + case X86::BI__builtin_ia32_undef512: + case X86::BI__builtin_ia32_vec_ext_v4hi: + case X86::BI__builtin_ia32_vec_ext_v16qi: + case X86::BI__builtin_ia32_vec_ext_v8hi: + case X86::BI__builtin_ia32_vec_ext_v4si: + case X86::BI__builtin_ia32_vec_ext_v4sf: + case X86::BI__builtin_ia32_vec_ext_v2di: + case X86::BI__builtin_ia32_vec_ext_v32qi: + case X86::BI__builtin_ia32_vec_ext_v16hi: + case X86::BI__builtin_ia32_vec_ext_v8si: + case X86::BI__builtin_ia32_vec_ext_v4di: + case X86::BI__builtin_ia32_vec_set_v4hi: + case X86::BI__builtin_ia32_vec_set_v16qi: + case X86::BI__builtin_ia32_vec_set_v8hi: + case X86::BI__builtin_ia32_vec_set_v4si: + case X86::BI__builtin_ia32_vec_set_v2di: + case X86::BI__builtin_ia32_vec_set_v32qi: + case X86::BI__builtin_ia32_vec_set_v16hi: + case X86::BI__builtin_ia32_vec_set_v8si: + case X86::BI__builtin_ia32_vec_set_v4di: + case X86::BI_mm_setcsr: + case X86::BI__builtin_ia32_ldmxcsr: + case X86::BI_mm_getcsr: + case X86::BI__builtin_ia32_stmxcsr: + case X86::BI__builtin_ia32_xsave: + case X86::BI__builtin_ia32_xsave64: + case X86::BI__builtin_ia32_xrstor: + case X86::BI__builtin_ia32_xrstor64: + case X86::BI__builtin_ia32_xsaveopt: + case X86::BI__builtin_ia32_xsaveopt64: + case X86::BI__builtin_ia32_xrstors: + case X86::BI__builtin_ia32_xrstors64: + case X86::BI__builtin_ia32_xsavec: + case 
X86::BI__builtin_ia32_xsavec64: + case X86::BI__builtin_ia32_xsaves: + case X86::BI__builtin_ia32_xsaves64: + case X86::BI__builtin_ia32_xsetbv: + case X86::BI_xsetbv: + case X86::BI__builtin_ia32_xgetbv: + case X86::BI_xgetbv: + case X86::BI__builtin_ia32_storedqudi128_mask: + case X86::BI__builtin_ia32_storedqusi128_mask: + case X86::BI__builtin_ia32_storedquhi128_mask: + case X86::BI__builtin_ia32_storedquqi128_mask: + case X86::BI__builtin_ia32_storeupd128_mask: + case X86::BI__builtin_ia32_storeups128_mask: + case X86::BI__builtin_ia32_storedqudi256_mask: + case X86::BI__builtin_ia32_storedqusi256_mask: + case X86::BI__builtin_ia32_storedquhi256_mask: + case X86::BI__builtin_ia32_storedquqi256_mask: + case X86::BI__builtin_ia32_storeupd256_mask: + case X86::BI__builtin_ia32_storeups256_mask: + case X86::BI__builtin_ia32_storedqudi512_mask: + case X86::BI__builtin_ia32_storedqusi512_mask: + case X86::BI__builtin_ia32_storedquhi512_mask: + case X86::BI__builtin_ia32_storedquqi512_mask: + case X86::BI__builtin_ia32_storeupd512_mask: + case X86::BI__builtin_ia32_storeups512_mask: + case X86::BI__builtin_ia32_storesbf16128_mask: + case X86::BI__builtin_ia32_storesh128_mask: + case X86::BI__builtin_ia32_storess128_mask: + case X86::BI__builtin_ia32_storesd128_mask: + case X86::BI__builtin_ia32_cvtmask2b128: + case X86::BI__builtin_ia32_cvtmask2b256: + case X86::BI__builtin_ia32_cvtmask2b512: + case X86::BI__builtin_ia32_cvtmask2w128: + case X86::BI__builtin_ia32_cvtmask2w256: + case X86::BI__builtin_ia32_cvtmask2w512: + case X86::BI__builtin_ia32_cvtmask2d128: + case X86::BI__builtin_ia32_cvtmask2d256: + case X86::BI__builtin_ia32_cvtmask2d512: + case X86::BI__builtin_ia32_cvtmask2q128: + case X86::BI__builtin_ia32_cvtmask2q256: + case X86::BI__builtin_ia32_cvtmask2q512: + case X86::BI__builtin_ia32_cvtb2mask128: + case X86::BI__builtin_ia32_cvtb2mask256: + case X86::BI__builtin_ia32_cvtb2mask512: + case X86::BI__builtin_ia32_cvtw2mask128: + case 
X86::BI__builtin_ia32_cvtw2mask256: + case X86::BI__builtin_ia32_cvtw2mask512: + case X86::BI__builtin_ia32_cvtd2mask128: + case X86::BI__builtin_ia32_cvtd2mask256: + case X86::BI__builtin_ia32_cvtd2mask512: + case X86::BI__builtin_ia32_cvtq2mask128: + case X86::BI__builtin_ia32_cvtq2mask256: + case X86::BI__builtin_ia32_cvtq2mask512: + case X86::BI__builtin_ia32_cvtdq2ps512_mask: + case X86::BI__builtin_ia32_cvtqq2ps512_mask: + case X86::BI__builtin_ia32_cvtqq2pd512_mask: + case X86::BI__builtin_ia32_vcvtw2ph512_mask: + case X86::BI__builtin_ia32_vcvtdq2ph512_mask: + case X86::BI__builtin_ia32_vcvtqq2ph512_mask: + case X86::BI__builtin_ia32_cvtudq2ps512_mask: + case X86::BI__builtin_ia32_cvtuqq2ps512_mask: + case X86::BI__builtin_ia32_cvtuqq2pd512_mask: + case X86::BI__builtin_ia32_vcvtuw2ph512_mask: + case X86::BI__builtin_ia32_vcvtudq2ph512_mask: + case X86::BI__builtin_ia32_vcvtuqq2ph512_mask: + case X86::BI__builtin_ia32_vfmaddss3: + case X86::BI__builtin_ia32_vfmaddsd3: + case X86::BI__builtin_ia32_vfmaddsh3_mask: + case X86::BI__builtin_ia32_vfmaddss3_mask: + case X86::BI__builtin_ia32_vfmaddsd3_mask: + case X86::BI__builtin_ia32_vfmaddss: + case X86::BI__builtin_ia32_vfmaddsd: + case X86::BI__builtin_ia32_vfmaddsh3_maskz: + case X86::BI__builtin_ia32_vfmaddss3_maskz: + case X86::BI__builtin_ia32_vfmaddsd3_maskz: + case X86::BI__builtin_ia32_vfmaddsh3_mask3: + case X86::BI__builtin_ia32_vfmaddss3_mask3: + case X86::BI__builtin_ia32_vfmaddsd3_mask3: + case X86::BI__builtin_ia32_vfmsubsh3_mask3: + case X86::BI__builtin_ia32_vfmsubss3_mask3: + case X86::BI__builtin_ia32_vfmsubsd3_mask3: + case X86::BI__builtin_ia32_vfmaddph512_mask: + case X86::BI__builtin_ia32_vfmaddph512_maskz: + case X86::BI__builtin_ia32_vfmaddph512_mask3: + case X86::BI__builtin_ia32_vfmaddps512_mask: + case X86::BI__builtin_ia32_vfmaddps512_maskz: + case X86::BI__builtin_ia32_vfmaddps512_mask3: + case X86::BI__builtin_ia32_vfmsubps512_mask3: + case X86::BI__builtin_ia32_vfmaddpd512_mask: 
+ case X86::BI__builtin_ia32_vfmaddpd512_maskz: + case X86::BI__builtin_ia32_vfmaddpd512_mask3: + case X86::BI__builtin_ia32_vfmsubpd512_mask3: + case X86::BI__builtin_ia32_vfmsubph512_mask3: + case X86::BI__builtin_ia32_vfmaddsubph512_mask: + case X86::BI__builtin_ia32_vfmaddsubph512_maskz: + case X86::BI__builtin_ia32_vfmaddsubph512_mask3: + case X86::BI__builtin_ia32_vfmsubaddph512_mask3: + case X86::BI__builtin_ia32_vfmaddsubps512_mask: + case X86::BI__builtin_ia32_vfmaddsubps512_maskz: + case X86::BI__builtin_ia32_vfmaddsubps512_mask3: + case X86::BI__builtin_ia32_vfmsubaddps512_mask3: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask: + case X86::BI__builtin_ia32_vfmaddsubpd512_maskz: + case X86::BI__builtin_ia32_vfmaddsubpd512_mask3: + case X86::BI__builtin_ia32_vfmsubaddpd512_mask3: + case X86::BI__builtin_ia32_movdqa32store128_mask: + case X86::BI__builtin_ia32_movdqa64store128_mask: + case X86::BI__builtin_ia32_storeaps128_mask: + case X86::BI__builtin_ia32_storeapd128_mask: + case X86::BI__builtin_ia32_movdqa32store256_mask: + case X86::BI__builtin_ia32_movdqa64store256_mask: + case X86::BI__builtin_ia32_storeaps256_mask: + case X86::BI__builtin_ia32_storeapd256_mask: + case X86::BI__builtin_ia32_movdqa32store512_mask: + case X86::BI__builtin_ia32_movdqa64store512_mask: + case X86::BI__builtin_ia32_storeaps512_mask: + case X86::BI__builtin_ia32_storeapd512_mask: + case X86::BI__builtin_ia32_loadups128_mask: + case X86::BI__builtin_ia32_loadups256_mask: + case X86::BI__builtin_ia32_loadups512_mask: + case X86::BI__builtin_ia32_loadupd128_mask: + case X86::BI__builtin_ia32_loadupd256_mask: + case X86::BI__builtin_ia32_loadupd512_mask: + case X86::BI__builtin_ia32_loaddquqi128_mask: + case X86::BI__builtin_ia32_loaddquqi256_mask: + case X86::BI__builtin_ia32_loaddquqi512_mask: + case X86::BI__builtin_ia32_loaddquhi128_mask: + case X86::BI__builtin_ia32_loaddquhi256_mask: + case X86::BI__builtin_ia32_loaddquhi512_mask: + case 
X86::BI__builtin_ia32_loaddqusi128_mask: + case X86::BI__builtin_ia32_loaddqusi256_mask: + case X86::BI__builtin_ia32_loaddqusi512_mask: + case X86::BI__builtin_ia32_loaddqudi128_mask: + case X86::BI__builtin_ia32_loaddqudi256_mask: + case X86::BI__builtin_ia32_loaddqudi512_mask: + case X86::BI__builtin_ia32_loadsbf16128_mask: + case X86::BI__builtin_ia32_loadsh128_mask: + case X86::BI__builtin_ia32_loadss128_mask: + case X86::BI__builtin_ia32_loadsd128_mask: + case X86::BI__builtin_ia32_loadaps128_mask: + case X86::BI__builtin_ia32_loadaps256_mask: + case X86::BI__builtin_ia32_loadaps512_mask: + case X86::BI__builtin_ia32_loadapd128_mask: + case X86::BI__builtin_ia32_loadapd256_mask: + case X86::BI__builtin_ia32_loadapd512_mask: + case X86::BI__builtin_ia32_movdqa32load128_mask: + case X86::BI__builtin_ia32_movdqa32load256_mask: + case X86::BI__builtin_ia32_movdqa32load512_mask: + case X86::BI__builtin_ia32_movdqa64load128_mask: + case X86::BI__builtin_ia32_movdqa64load256_mask: + case X86::BI__builtin_ia32_movdqa64load512_mask: + case X86::BI__builtin_ia32_expandloaddf128_mask: + case X86::BI__builtin_ia32_expandloaddf256_mask: + case X86::BI__builtin_ia32_expandloaddf512_mask: + case X86::BI__builtin_ia32_expandloadsf128_mask: + case X86::BI__builtin_ia32_expandloadsf256_mask: + case X86::BI__builtin_ia32_expandloadsf512_mask: + case X86::BI__builtin_ia32_expandloaddi128_mask: + case X86::BI__builtin_ia32_expandloaddi256_mask: + case X86::BI__builtin_ia32_expandloaddi512_mask: + case X86::BI__builtin_ia32_expandloadsi128_mask: + case X86::BI__builtin_ia32_expandloadsi256_mask: + case X86::BI__builtin_ia32_expandloadsi512_mask: + case X86::BI__builtin_ia32_expandloadhi128_mask: + case X86::BI__builtin_ia32_expandloadhi256_mask: + case X86::BI__builtin_ia32_expandloadhi512_mask: + case X86::BI__builtin_ia32_expandloadqi128_mask: + case X86::BI__builtin_ia32_expandloadqi256_mask: + case X86::BI__builtin_ia32_expandloadqi512_mask: + case 
X86::BI__builtin_ia32_compressstoredf128_mask: + case X86::BI__builtin_ia32_compressstoredf256_mask: + case X86::BI__builtin_ia32_compressstoredf512_mask: + case X86::BI__builtin_ia32_compressstoresf128_mask: + case X86::BI__builtin_ia32_compressstoresf256_mask: + case X86::BI__builtin_ia32_compressstoresf512_mask: + case X86::BI__builtin_ia32_compressstoredi128_mask: + case X86::BI__builtin_ia32_compressstoredi256_mask: + case X86::BI__builtin_ia32_compressstoredi512_mask: + case X86::BI__builtin_ia32_compressstoresi128_mask: + case X86::BI__builtin_ia32_compressstoresi256_mask: + case X86::BI__builtin_ia32_compressstoresi512_mask: + case X86::BI__builtin_ia32_compressstorehi128_mask: + case X86::BI__builtin_ia32_compressstorehi256_mask: + case X86::BI__builtin_ia32_compressstorehi512_mask: + case X86::BI__builtin_ia32_compressstoreqi128_mask: + case X86::BI__builtin_ia32_compressstoreqi256_mask: + case X86::BI__builtin_ia32_compressstoreqi512_mask: + case X86::BI__builtin_ia32_expanddf128_mask: + case X86::BI__builtin_ia32_expanddf256_mask: + case X86::BI__builtin_ia32_expanddf512_mask: + case X86::BI__builtin_ia32_expandsf128_mask: + case X86::BI__builtin_ia32_expandsf256_mask: + case X86::BI__builtin_ia32_expandsf512_mask: + case X86::BI__builtin_ia32_expanddi128_mask: + case X86::BI__builtin_ia32_expanddi256_mask: + case X86::BI__builtin_ia32_expanddi512_mask: + case X86::BI__builtin_ia32_expandsi128_mask: + case X86::BI__builtin_ia32_expandsi256_mask: + case X86::BI__builtin_ia32_expandsi512_mask: + case X86::BI__builtin_ia32_expandhi128_mask: + case X86::BI__builtin_ia32_expandhi256_mask: + case X86::BI__builtin_ia32_expandhi512_mask: + case X86::BI__builtin_ia32_expandqi128_mask: + case X86::BI__builtin_ia32_expandqi256_mask: + case X86::BI__builtin_ia32_expandqi512_mask: + case X86::BI__builtin_ia32_compressdf128_mask: + case X86::BI__builtin_ia32_compressdf256_mask: + case X86::BI__builtin_ia32_compressdf512_mask: + case 
X86::BI__builtin_ia32_compresssf128_mask: + case X86::BI__builtin_ia32_compresssf256_mask: + case X86::BI__builtin_ia32_compresssf512_mask: + case X86::BI__builtin_ia32_compressdi128_mask: + case X86::BI__builtin_ia32_compressdi256_mask: + case X86::BI__builtin_ia32_compressdi512_mask: + case X86::BI__builtin_ia32_compresssi128_mask: + case X86::BI__builtin_ia32_compresssi256_mask: + case X86::BI__builtin_ia32_compresssi512_mask: + case X86::BI__builtin_ia32_compresshi128_mask: + case X86::BI__builtin_ia32_compresshi256_mask: + case X86::BI__builtin_ia32_compresshi512_mask: + case X86::BI__builtin_ia32_compressqi128_mask: + case X86::BI__builtin_ia32_compressqi256_mask: + case X86::BI__builtin_ia32_compressqi512_mask: + case X86::BI__builtin_ia32_gather3div2df: + case X86::BI__builtin_ia32_gather3div2di: + case X86::BI__builtin_ia32_gather3div4df: + case X86::BI__builtin_ia32_gather3div4di: + case X86::BI__builtin_ia32_gather3div4sf: + case X86::BI__builtin_ia32_gather3div4si: + case X86::BI__builtin_ia32_gather3div8sf: + case X86::BI__builtin_ia32_gather3div8si: + case X86::BI__builtin_ia32_gather3siv2df: + case X86::BI__builtin_ia32_gather3siv2di: + case X86::BI__builtin_ia32_gather3siv4df: + case X86::BI__builtin_ia32_gather3siv4di: + case X86::BI__builtin_ia32_gather3siv4sf: + case X86::BI__builtin_ia32_gather3siv4si: + case X86::BI__builtin_ia32_gather3siv8sf: + case X86::BI__builtin_ia32_gather3siv8si: + case X86::BI__builtin_ia32_gathersiv8df: + case X86::BI__builtin_ia32_gathersiv16sf: + case X86::BI__builtin_ia32_gatherdiv8df: + case X86::BI__builtin_ia32_gatherdiv16sf: + case X86::BI__builtin_ia32_gathersiv8di: + case X86::BI__builtin_ia32_gathersiv16si: + case X86::BI__builtin_ia32_gatherdiv8di: + case X86::BI__builtin_ia32_gatherdiv16si: + case X86::BI__builtin_ia32_scattersiv8df: + case X86::BI__builtin_ia32_scattersiv16sf: + case X86::BI__builtin_ia32_scatterdiv8df: + case X86::BI__builtin_ia32_scatterdiv16sf: + case 
X86::BI__builtin_ia32_scattersiv8di: + case X86::BI__builtin_ia32_scattersiv16si: + case X86::BI__builtin_ia32_scatterdiv8di: + case X86::BI__builtin_ia32_scatterdiv16si: + case X86::BI__builtin_ia32_scatterdiv2df: + case X86::BI__builtin_ia32_scatterdiv2di: + case X86::BI__builtin_ia32_scatterdiv4df: + case X86::BI__builtin_ia32_scatterdiv4di: + case X86::BI__builtin_ia32_scatterdiv4sf: + case X86::BI__builtin_ia32_scatterdiv4si: + case X86::BI__builtin_ia32_scatterdiv8sf: + case X86::BI__builtin_ia32_scatterdiv8si: + case X86::BI__builtin_ia32_scattersiv2df: + case X86::BI__builtin_ia32_scattersiv2di: + case X86::BI__builtin_ia32_scattersiv4df: + case X86::BI__builtin_ia32_scattersiv4di: + case X86::BI__builtin_ia32_scattersiv4sf: + case X86::BI__builtin_ia32_scattersiv4si: + case X86::BI__builtin_ia32_scattersiv8sf: + case X86::BI__builtin_ia32_scattersiv8si: + case X86::BI__builtin_ia32_vextractf128_pd256: + case X86::BI__builtin_ia32_vextractf128_ps256: + case X86::BI__builtin_ia32_vextractf128_si256: + case X86::BI__builtin_ia32_extract128i256: + case X86::BI__builtin_ia32_extractf64x4_mask: + case X86::BI__builtin_ia32_extractf32x4_mask: + case X86::BI__builtin_ia32_extracti64x4_mask: + case X86::BI__builtin_ia32_extracti32x4_mask: + case X86::BI__builtin_ia32_extractf32x8_mask: + case X86::BI__builtin_ia32_extracti32x8_mask: + case X86::BI__builtin_ia32_extractf32x4_256_mask: + case X86::BI__builtin_ia32_extracti32x4_256_mask: + case X86::BI__builtin_ia32_extractf64x2_256_mask: + case X86::BI__builtin_ia32_extracti64x2_256_mask: + case X86::BI__builtin_ia32_extractf64x2_512_mask: + case X86::BI__builtin_ia32_extracti64x2_512_mask: + case X86::BI__builtin_ia32_vinsertf128_pd256: + case X86::BI__builtin_ia32_vinsertf128_ps256: + case X86::BI__builtin_ia32_vinsertf128_si256: + case X86::BI__builtin_ia32_insert128i256: + case X86::BI__builtin_ia32_insertf64x4: + case X86::BI__builtin_ia32_insertf32x4: + case X86::BI__builtin_ia32_inserti64x4: + case 
X86::BI__builtin_ia32_inserti32x4: + case X86::BI__builtin_ia32_insertf32x8: + case X86::BI__builtin_ia32_inserti32x8: + case X86::BI__builtin_ia32_insertf32x4_256: + case X86::BI__builtin_ia32_inserti32x4_256: + case X86::BI__builtin_ia32_insertf64x2_256: + case X86::BI__builtin_ia32_inserti64x2_256: + case X86::BI__builtin_ia32_insertf64x2_512: + case X86::BI__builtin_ia32_inserti64x2_512: + case X86::BI__builtin_ia32_pmovqd512_mask: + case X86::BI__builtin_ia32_pmovwb512_mask: + case X86::BI__builtin_ia32_pblendw128: + case X86::BI__builtin_ia32_blendpd: + case X86::BI__builtin_ia32_blendps: + case X86::BI__builtin_ia32_blendpd256: + case X86::BI__builtin_ia32_blendps256: + case X86::BI__builtin_ia32_pblendw256: + case X86::BI__builtin_ia32_pblendd128: + case X86::BI__builtin_ia32_pblendd256: + case X86::BI__builtin_ia32_pshuflw: + case X86::BI__builtin_ia32_pshuflw256: + case X86::BI__builtin_ia32_pshuflw512: + case X86::BI__builtin_ia32_pshufhw: + case X86::BI__builtin_ia32_pshufhw256: + case X86::BI__builtin_ia32_pshufhw512: + case X86::BI__builtin_ia32_pshufd: + case X86::BI__builtin_ia32_pshufd256: + case X86::BI__builtin_ia32_pshufd512: + case X86::BI__builtin_ia32_vpermilpd: + case X86::BI__builtin_ia32_vpermilps: + case X86::BI__builtin_ia32_vpermilpd256: + case X86::BI__builtin_ia32_vpermilps256: + case X86::BI__builtin_ia32_vpermilpd512: + case X86::BI__builtin_ia32_vpermilps512: + case X86::BI__builtin_ia32_shufpd: + case X86::BI__builtin_ia32_shufpd256: + case X86::BI__builtin_ia32_shufpd512: + case X86::BI__builtin_ia32_shufps: + case X86::BI__builtin_ia32_shufps256: + case X86::BI__builtin_ia32_shufps512: + case X86::BI__builtin_ia32_permdi256: + case X86::BI__builtin_ia32_permdf256: + case X86::BI__builtin_ia32_permdi512: + case X86::BI__builtin_ia32_permdf512: + case X86::BI__builtin_ia32_palignr128: + case X86::BI__builtin_ia32_palignr256: + case X86::BI__builtin_ia32_palignr512: + case X86::BI__builtin_ia32_alignd128: + case 
X86::BI__builtin_ia32_alignd256: + case X86::BI__builtin_ia32_alignd512: + case X86::BI__builtin_ia32_alignq128: + case X86::BI__builtin_ia32_alignq256: + case X86::BI__builtin_ia32_alignq512: + case X86::BI__builtin_ia32_shuf_f32x4_256: + case X86::BI__builtin_ia32_shuf_f64x2_256: + case X86::BI__builtin_ia32_shuf_i32x4_256: + case X86::BI__builtin_ia32_shuf_i64x2_256: + case X86::BI__builtin_ia32_shuf_f32x4: + case X86::BI__builtin_ia32_shuf_f64x2: + case X86::BI__builtin_ia32_shuf_i32x4: + case X86::BI__builtin_ia32_shuf_i64x2: + case X86::BI__builtin_ia32_vperm2f128_pd256: + case X86::BI__builtin_ia32_vperm2f128_ps256: + case X86::BI__builtin_ia32_vperm2f128_si256: + case X86::BI__builtin_ia32_permti256: + case X86::BI__builtin_ia32_pslldqi128_byteshift: + case X86::BI__builtin_ia32_pslldqi256_byteshift: + case X86::BI__builtin_ia32_pslldqi512_byteshift: + case X86::BI__builtin_ia32_psrldqi128_byteshift: + case X86::BI__builtin_ia32_psrldqi256_byteshift: + case X86::BI__builtin_ia32_psrldqi512_byteshift: + case X86::BI__builtin_ia32_kshiftliqi: + case X86::BI__builtin_ia32_kshiftlihi: + case X86::BI__builtin_ia32_kshiftlisi: + case X86::BI__builtin_ia32_kshiftlidi: + case X86::BI__builtin_ia32_kshiftriqi: + case X86::BI__builtin_ia32_kshiftrihi: + case X86::BI__builtin_ia32_kshiftrisi: + case X86::BI__builtin_ia32_kshiftridi: + case X86::BI__builtin_ia32_vprotbi: + case X86::BI__builtin_ia32_vprotwi: + case X86::BI__builtin_ia32_vprotdi: + case X86::BI__builtin_ia32_vprotqi: + case X86::BI__builtin_ia32_prold128: + case X86::BI__builtin_ia32_prold256: + case X86::BI__builtin_ia32_prold512: + case X86::BI__builtin_ia32_prolq128: + case X86::BI__builtin_ia32_prolq256: + case X86::BI__builtin_ia32_prolq512: + case X86::BI__builtin_ia32_prord128: + case X86::BI__builtin_ia32_prord256: + case X86::BI__builtin_ia32_prord512: + case X86::BI__builtin_ia32_prorq128: + case X86::BI__builtin_ia32_prorq256: + case X86::BI__builtin_ia32_prorq512: + case 
X86::BI__builtin_ia32_selectb_128: + case X86::BI__builtin_ia32_selectb_256: + case X86::BI__builtin_ia32_selectb_512: + case X86::BI__builtin_ia32_selectw_128: + case X86::BI__builtin_ia32_selectw_256: + case X86::BI__builtin_ia32_selectw_512: + case X86::BI__builtin_ia32_selectd_128: + case X86::BI__builtin_ia32_selectd_256: + case X86::BI__builtin_ia32_selectd_512: + case X86::BI__builtin_ia32_selectq_128: + case X86::BI__builtin_ia32_selectq_256: + case X86::BI__builtin_ia32_selectq_512: + case X86::BI__builtin_ia32_selectph_128: + case X86::BI__builtin_ia32_selectph_256: + case X86::BI__builtin_ia32_selectph_512: + case X86::BI__builtin_ia32_selectpbf_128: + case X86::BI__builtin_ia32_selectpbf_256: + case X86::BI__builtin_ia32_selectpbf_512: + case X86::BI__builtin_ia32_selectps_128: + case X86::BI__builtin_ia32_selectps_256: + case X86::BI__builtin_ia32_selectps_512: + case X86::BI__builtin_ia32_selectpd_128: + case X86::BI__builtin_ia32_selectpd_256: + case X86::BI__builtin_ia32_selectpd_512: + case X86::BI__builtin_ia32_selectsh_128: + case X86::BI__builtin_ia32_selectsbf_128: + case X86::BI__builtin_ia32_selectss_128: + case X86::BI__builtin_ia32_selectsd_128: + case X86::BI__builtin_ia32_cmpb128_mask: + case X86::BI__builtin_ia32_cmpb256_mask: + case X86::BI__builtin_ia32_cmpb512_mask: + case X86::BI__builtin_ia32_cmpw128_mask: + case X86::BI__builtin_ia32_cmpw256_mask: + case X86::BI__builtin_ia32_cmpw512_mask: + case X86::BI__builtin_ia32_cmpd128_mask: + case X86::BI__builtin_ia32_cmpd256_mask: + case X86::BI__builtin_ia32_cmpd512_mask: + case X86::BI__builtin_ia32_cmpq128_mask: + case X86::BI__builtin_ia32_cmpq256_mask: + case X86::BI__builtin_ia32_cmpq512_mask: + case X86::BI__builtin_ia32_ucmpb128_mask: + case X86::BI__builtin_ia32_ucmpb256_mask: + case X86::BI__builtin_ia32_ucmpb512_mask: + case X86::BI__builtin_ia32_ucmpw128_mask: + case X86::BI__builtin_ia32_ucmpw256_mask: + case X86::BI__builtin_ia32_ucmpw512_mask: + case 
X86::BI__builtin_ia32_ucmpd128_mask: + case X86::BI__builtin_ia32_ucmpd256_mask: + case X86::BI__builtin_ia32_ucmpd512_mask: + case X86::BI__builtin_ia32_ucmpq128_mask: + case X86::BI__builtin_ia32_ucmpq256_mask: + case X86::BI__builtin_ia32_ucmpq512_mask: + case X86::BI__builtin_ia32_vpcomb: + case X86::BI__builtin_ia32_vpcomw: + case X86::BI__builtin_ia32_vpcomd: + case X86::BI__builtin_ia32_vpcomq: + case X86::BI__builtin_ia32_vpcomub: + case X86::BI__builtin_ia32_vpcomuw: + case X86::BI__builtin_ia32_vpcomud: + case X86::BI__builtin_ia32_vpcomuq: + case X86::BI__builtin_ia32_kortestcqi: + case X86::BI__builtin_ia32_kortestchi: + case X86::BI__builtin_ia32_kortestcsi: + case X86::BI__builtin_ia32_kortestcdi: + case X86::BI__builtin_ia32_kortestzqi: + case X86::BI__builtin_ia32_kortestzhi: + case X86::BI__builtin_ia32_kortestzsi: + case X86::BI__builtin_ia32_kortestzdi: + case X86::BI__builtin_ia32_ktestcqi: + case X86::BI__builtin_ia32_ktestzqi: + case X86::BI__builtin_ia32_ktestchi: + case X86::BI__builtin_ia32_ktestzhi: + case X86::BI__builtin_ia32_ktestcsi: + case X86::BI__builtin_ia32_ktestzsi: + case X86::BI__builtin_ia32_ktestcdi: + case X86::BI__builtin_ia32_ktestzdi: + case X86::BI__builtin_ia32_kaddqi: + case X86::BI__builtin_ia32_kaddhi: + case X86::BI__builtin_ia32_kaddsi: + case X86::BI__builtin_ia32_kadddi: + case X86::BI__builtin_ia32_kandqi: + case X86::BI__builtin_ia32_kandhi: + case X86::BI__builtin_ia32_kandsi: + case X86::BI__builtin_ia32_kanddi: + case X86::BI__builtin_ia32_kandnqi: + case X86::BI__builtin_ia32_kandnhi: + case X86::BI__builtin_ia32_kandnsi: + case X86::BI__builtin_ia32_kandndi: + case X86::BI__builtin_ia32_korqi: + case X86::BI__builtin_ia32_korhi: + case X86::BI__builtin_ia32_korsi: + case X86::BI__builtin_ia32_kordi: + case X86::BI__builtin_ia32_kxnorqi: + case X86::BI__builtin_ia32_kxnorhi: + case X86::BI__builtin_ia32_kxnorsi: + case X86::BI__builtin_ia32_kxnordi: + case X86::BI__builtin_ia32_kxorqi: + case 
X86::BI__builtin_ia32_kxorhi: + case X86::BI__builtin_ia32_kxorsi: + case X86::BI__builtin_ia32_kxordi: + case X86::BI__builtin_ia32_knotqi: + case X86::BI__builtin_ia32_knothi: + case X86::BI__builtin_ia32_knotsi: + case X86::BI__builtin_ia32_knotdi: + case X86::BI__builtin_ia32_kmovb: + case X86::BI__builtin_ia32_kmovw: + case X86::BI__builtin_ia32_kmovd: + case X86::BI__builtin_ia32_kmovq: + case X86::BI__builtin_ia32_kunpckdi: + case X86::BI__builtin_ia32_kunpcksi: + case X86::BI__builtin_ia32_kunpckhi: + case X86::BI__builtin_ia32_sqrtsh_round_mask: + case X86::BI__builtin_ia32_sqrtsd_round_mask: + case X86::BI__builtin_ia32_sqrtss_round_mask: + case X86::BI__builtin_ia32_sqrtpd256: + case X86::BI__builtin_ia32_sqrtpd: + case X86::BI__builtin_ia32_sqrtps256: + case X86::BI__builtin_ia32_sqrtps: + case X86::BI__builtin_ia32_sqrtph256: + case X86::BI__builtin_ia32_sqrtph: + case X86::BI__builtin_ia32_sqrtph512: + case X86::BI__builtin_ia32_vsqrtbf16256: + case X86::BI__builtin_ia32_vsqrtbf16: + case X86::BI__builtin_ia32_vsqrtbf16512: + case X86::BI__builtin_ia32_sqrtps512: + case X86::BI__builtin_ia32_sqrtpd512: + case X86::BI__builtin_ia32_pmuludq128: + case X86::BI__builtin_ia32_pmuludq256: + case X86::BI__builtin_ia32_pmuludq512: + case X86::BI__builtin_ia32_pmuldq128: + case X86::BI__builtin_ia32_pmuldq256: + case X86::BI__builtin_ia32_pmuldq512: + case X86::BI__builtin_ia32_pternlogd512_mask: + case X86::BI__builtin_ia32_pternlogq512_mask: + case X86::BI__builtin_ia32_pternlogd128_mask: + case X86::BI__builtin_ia32_pternlogd256_mask: + case X86::BI__builtin_ia32_pternlogq128_mask: + case X86::BI__builtin_ia32_pternlogq256_mask: + case X86::BI__builtin_ia32_pternlogd512_maskz: + case X86::BI__builtin_ia32_pternlogq512_maskz: + case X86::BI__builtin_ia32_pternlogd128_maskz: + case X86::BI__builtin_ia32_pternlogd256_maskz: + case X86::BI__builtin_ia32_pternlogq128_maskz: + case X86::BI__builtin_ia32_pternlogq256_maskz: + case X86::BI__builtin_ia32_vpshldd128: 
+ case X86::BI__builtin_ia32_vpshldd256: + case X86::BI__builtin_ia32_vpshldd512: + case X86::BI__builtin_ia32_vpshldq128: + case X86::BI__builtin_ia32_vpshldq256: + case X86::BI__builtin_ia32_vpshldq512: + case X86::BI__builtin_ia32_vpshldw128: + case X86::BI__builtin_ia32_vpshldw256: + case X86::BI__builtin_ia32_vpshldw512: + case X86::BI__builtin_ia32_vpshrdd128: + case X86::BI__builtin_ia32_vpshrdd256: + case X86::BI__builtin_ia32_vpshrdd512: + case X86::BI__builtin_ia32_vpshrdq128: + case X86::BI__builtin_ia32_vpshrdq256: + case X86::BI__builtin_ia32_vpshrdq512: + case X86::BI__builtin_ia32_vpshrdw128: + case X86::BI__builtin_ia32_vpshrdw256: + case X86::BI__builtin_ia32_vpshrdw512: + case X86::BI__builtin_ia32_reduce_fadd_pd512: + case X86::BI__builtin_ia32_reduce_fadd_ps512: + case X86::BI__builtin_ia32_reduce_fadd_ph512: + case X86::BI__builtin_ia32_reduce_fadd_ph256: + case X86::BI__builtin_ia32_reduce_fadd_ph128: + case X86::BI__builtin_ia32_reduce_fmul_pd512: + case X86::BI__builtin_ia32_reduce_fmul_ps512: + case X86::BI__builtin_ia32_reduce_fmul_ph512: + case X86::BI__builtin_ia32_reduce_fmul_ph256: + case X86::BI__builtin_ia32_reduce_fmul_ph128: + case X86::BI__builtin_ia32_reduce_fmax_pd512: + case X86::BI__builtin_ia32_reduce_fmax_ps512: + case X86::BI__builtin_ia32_reduce_fmax_ph512: + case X86::BI__builtin_ia32_reduce_fmax_ph256: + case X86::BI__builtin_ia32_reduce_fmax_ph128: + case X86::BI__builtin_ia32_reduce_fmin_pd512: + case X86::BI__builtin_ia32_reduce_fmin_ps512: + case X86::BI__builtin_ia32_reduce_fmin_ph512: + case X86::BI__builtin_ia32_reduce_fmin_ph256: + case X86::BI__builtin_ia32_reduce_fmin_ph128: + case X86::BI__builtin_ia32_rdrand16_step: + case X86::BI__builtin_ia32_rdrand32_step: + case X86::BI__builtin_ia32_rdrand64_step: + case X86::BI__builtin_ia32_rdseed16_step: + case X86::BI__builtin_ia32_rdseed32_step: + case X86::BI__builtin_ia32_rdseed64_step: + case X86::BI__builtin_ia32_addcarryx_u32: + case 
X86::BI__builtin_ia32_addcarryx_u64: + case X86::BI__builtin_ia32_subborrow_u32: + case X86::BI__builtin_ia32_subborrow_u64: + case X86::BI__builtin_ia32_fpclassps128_mask: + case X86::BI__builtin_ia32_fpclassps256_mask: + case X86::BI__builtin_ia32_fpclassps512_mask: + case X86::BI__builtin_ia32_vfpclassbf16128_mask: + case X86::BI__builtin_ia32_vfpclassbf16256_mask: + case X86::BI__builtin_ia32_vfpclassbf16512_mask: + case X86::BI__builtin_ia32_fpclassph128_mask: + case X86::BI__builtin_ia32_fpclassph256_mask: + case X86::BI__builtin_ia32_fpclassph512_mask: + case X86::BI__builtin_ia32_fpclasspd128_mask: + case X86::BI__builtin_ia32_fpclasspd256_mask: + case X86::BI__builtin_ia32_fpclasspd512_mask: + case X86::BI__builtin_ia32_vp2intersect_q_512: + case X86::BI__builtin_ia32_vp2intersect_q_256: + case X86::BI__builtin_ia32_vp2intersect_q_128: + case X86::BI__builtin_ia32_vp2intersect_d_512: + case X86::BI__builtin_ia32_vp2intersect_d_256: + case X86::BI__builtin_ia32_vp2intersect_d_128: + case X86::BI__builtin_ia32_vpmultishiftqb128: + case X86::BI__builtin_ia32_vpmultishiftqb256: + case X86::BI__builtin_ia32_vpmultishiftqb512: + case X86::BI__builtin_ia32_vpshufbitqmb128_mask: + case X86::BI__builtin_ia32_vpshufbitqmb256_mask: + case X86::BI__builtin_ia32_vpshufbitqmb512_mask: + case X86::BI__builtin_ia32_cmpeqps: + case X86::BI__builtin_ia32_cmpeqpd: + case X86::BI__builtin_ia32_cmpltps: + case X86::BI__builtin_ia32_cmpltpd: + case X86::BI__builtin_ia32_cmpleps: + case X86::BI__builtin_ia32_cmplepd: + case X86::BI__builtin_ia32_cmpunordps: + case X86::BI__builtin_ia32_cmpunordpd: + case X86::BI__builtin_ia32_cmpneqps: + case X86::BI__builtin_ia32_cmpneqpd: + case X86::BI__builtin_ia32_cmpnltps: + case X86::BI__builtin_ia32_cmpnltpd: + case X86::BI__builtin_ia32_cmpnleps: + case X86::BI__builtin_ia32_cmpnlepd: + case X86::BI__builtin_ia32_cmpordps: + case X86::BI__builtin_ia32_cmpordpd: + case X86::BI__builtin_ia32_cmpph128_mask: + case 
X86::BI__builtin_ia32_cmpph256_mask: + case X86::BI__builtin_ia32_cmpph512_mask: + case X86::BI__builtin_ia32_cmpps128_mask: + case X86::BI__builtin_ia32_cmpps256_mask: + case X86::BI__builtin_ia32_cmpps512_mask: + case X86::BI__builtin_ia32_cmppd128_mask: + case X86::BI__builtin_ia32_cmppd256_mask: + case X86::BI__builtin_ia32_cmppd512_mask: + case X86::BI__builtin_ia32_vcmpbf16512_mask: + case X86::BI__builtin_ia32_vcmpbf16256_mask: + case X86::BI__builtin_ia32_vcmpbf16128_mask: + case X86::BI__builtin_ia32_cmpps: + case X86::BI__builtin_ia32_cmpps256: + case X86::BI__builtin_ia32_cmppd: + case X86::BI__builtin_ia32_cmppd256: + case X86::BI__builtin_ia32_cmpeqss: + case X86::BI__builtin_ia32_cmpltss: + case X86::BI__builtin_ia32_cmpless: + case X86::BI__builtin_ia32_cmpunordss: + case X86::BI__builtin_ia32_cmpneqss: + case X86::BI__builtin_ia32_cmpnltss: + case X86::BI__builtin_ia32_cmpnless: + case X86::BI__builtin_ia32_cmpordss: + case X86::BI__builtin_ia32_cmpeqsd: + case X86::BI__builtin_ia32_cmpltsd: + case X86::BI__builtin_ia32_cmplesd: + case X86::BI__builtin_ia32_cmpunordsd: + case X86::BI__builtin_ia32_cmpneqsd: + case X86::BI__builtin_ia32_cmpnltsd: + case X86::BI__builtin_ia32_cmpnlesd: + case X86::BI__builtin_ia32_cmpordsd: + case X86::BI__builtin_ia32_vcvtph2ps_mask: + case X86::BI__builtin_ia32_vcvtph2ps256_mask: + case X86::BI__builtin_ia32_vcvtph2ps512_mask: + case X86::BI__builtin_ia32_cvtneps2bf16_128_mask: + case X86::BI__builtin_ia32_cvtsbf162ss_32: + case X86::BI__builtin_ia32_cvtneps2bf16_256_mask: + case X86::BI__builtin_ia32_cvtneps2bf16_512_mask: + case X86::BI__cpuid: + case X86::BI__cpuidex: + case X86::BI__emul: + case X86::BI__emulu: + case X86::BI__mulh: + case X86::BI__umulh: + case X86::BI_mul128: + case X86::BI_umul128: + case X86::BI__faststorefence: + case X86::BI__shiftleft128: + case X86::BI__shiftright128: + case X86::BI_ReadWriteBarrier: + case X86::BI_ReadBarrier: + case X86::BI_WriteBarrier: + case 
X86::BI_AddressOfReturnAddress: + case X86::BI__stosb: + case X86::BI__builtin_ia32_t2rpntlvwz0_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rs_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz0rst1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1rs_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1t1_internal: + case X86::BI__builtin_ia32_t2rpntlvwz1rst1_internal: + case X86::BI__ud2: + case X86::BI__int2c: + case X86::BI__readfsbyte: + case X86::BI__readfsword: + case X86::BI__readfsdword: + case X86::BI__readfsqword: + case X86::BI__readgsbyte: + case X86::BI__readgsword: + case X86::BI__readgsdword: + case X86::BI__readgsqword: + case X86::BI__builtin_ia32_encodekey128_u32: + case X86::BI__builtin_ia32_encodekey256_u32: + case X86::BI__builtin_ia32_aesenc128kl_u8: + case X86::BI__builtin_ia32_aesdec128kl_u8: + case X86::BI__builtin_ia32_aesenc256kl_u8: + case X86::BI__builtin_ia32_aesdec256kl_u8: + case X86::BI__builtin_ia32_aesencwide128kl_u8: + case X86::BI__builtin_ia32_aesdecwide128kl_u8: + case X86::BI__builtin_ia32_aesencwide256kl_u8: + case X86::BI__builtin_ia32_aesdecwide256kl_u8: + case X86::BI__builtin_ia32_vfcmaddcph512_mask: + case X86::BI__builtin_ia32_vfmaddcph512_mask: + case X86::BI__builtin_ia32_vfcmaddcsh_round_mask: + case X86::BI__builtin_ia32_vfmaddcsh_round_mask: + case X86::BI__builtin_ia32_vfcmaddcsh_round_mask3: + case X86::BI__builtin_ia32_vfmaddcsh_round_mask3: + case X86::BI__builtin_ia32_prefetchi: + cgm.errorNYI(e->getSourceRange(), + std::string("unimplemented X86 builtin call: ") + + getContext().BuiltinInfo.getName(builtinID)); + return {}; + } +} diff --git a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp index 171ce1c950907..a3e20817d2ca4 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXX.cpp @@ -53,7 +53,7 @@ static void emitDeclInit(CIRGenFunction &cgf, 
const VarDecl *varDecl, cgf.emitScalarInit(init, cgf.getLoc(varDecl->getLocation()), lv, false); break; case cir::TEK_Complex: - cgf.cgm.errorNYI(varDecl->getSourceRange(), "complex global initializer"); + cgf.emitComplexExprIntoLValue(init, lv, /*isInit=*/true); break; case cir::TEK_Aggregate: assert(!cir::MissingFeatures::aggValueSlotGC()); @@ -151,7 +151,7 @@ static void emitDeclDestroy(CIRGenFunction &cgf, const VarDecl *vd, // Don't confuse lexical cleanup. builder.clearInsertionPoint(); } else { - builder.create(addr.getLoc()); + cir::YieldOp::create(builder, addr.getLoc()); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp index d81492ef01540..aa0182e9677ef 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.cpp @@ -70,8 +70,8 @@ cir::GlobalLinkageKind CIRGenCXXABI::getCXXDestructorLinkage( mlir::Value CIRGenCXXABI::loadIncomingCXXThis(CIRGenFunction &cgf) { ImplicitParamDecl *vd = getThisDecl(cgf); Address addr = cgf.getAddrOfLocalVar(vd); - return cgf.getBuilder().create( - cgf.getLoc(vd->getLocation()), addr.getElementType(), addr.getPointer()); + return cir::LoadOp::create(cgf.getBuilder(), cgf.getLoc(vd->getLocation()), + addr.getElementType(), addr.getPointer()); } void CIRGenCXXABI::setCXXABIThisValue(CIRGenFunction &cgf, @@ -85,8 +85,7 @@ CharUnits CIRGenCXXABI::getArrayCookieSize(const CXXNewExpr *e) { if (!requiresArrayCookie(e)) return CharUnits::Zero(); - cgm.errorNYI(e->getSourceRange(), "CIRGenCXXABI::getArrayCookieSize"); - return CharUnits::Zero(); + return getArrayCookieSizeImpl(e->getAllocatedType()); } bool CIRGenCXXABI::requiresArrayCookie(const CXXNewExpr *e) { diff --git a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h index da7cc72136ef1..13dc9f305945a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCXXABI.h +++ b/clang/lib/CIR/CodeGen/CIRGenCXXABI.h @@ -124,6 +124,8 @@ class CIRGenCXXABI { virtual void 
emitRethrow(CIRGenFunction &cgf, bool isNoReturn) = 0; virtual void emitThrow(CIRGenFunction &cgf, const CXXThrowExpr *e) = 0; + virtual void emitBadCastCall(CIRGenFunction &cgf, mlir::Location loc) = 0; + virtual mlir::Attribute getAddrOfRTTIDescriptor(mlir::Location loc, QualType ty) = 0; @@ -185,6 +187,11 @@ class CIRGenCXXABI { virtual void registerGlobalDtor(const VarDecl *vd, cir::FuncOp dtor, mlir::Value addr) = 0; + virtual void emitVirtualObjectDelete(CIRGenFunction &cgf, + const CXXDeleteExpr *de, Address ptr, + QualType elementType, + const CXXDestructorDecl *dtor) = 0; + /// Checks if ABI requires extra virtual offset for vtable field. virtual bool isVirtualOffsetNeededForVTableField(CIRGenFunction &cgf, @@ -302,8 +309,28 @@ class CIRGenCXXABI { /// - non-array allocations never need a cookie /// - calls to \::operator new(size_t, void*) never need a cookie /// - /// \param E - the new-expression being allocated. + /// \param e - the new-expression being allocated. virtual CharUnits getArrayCookieSize(const CXXNewExpr *e); + + /// Initialize the array cookie for the given allocation. + /// + /// \param newPtr - a char* which is the presumed-non-null + /// return value of the allocation function + /// \param numElements - the computed number of elements, + /// potentially collapsed from the multidimensional array case; + /// always a size_t + /// \param elementType - the base element allocated type, + /// i.e. the allocated type after stripping all array types + virtual Address initializeArrayCookie(CIRGenFunction &cgf, Address newPtr, + mlir::Value numElements, + const CXXNewExpr *e, + QualType elementType) = 0; + +protected: + /// Returns the extra size required in order to store the array + /// cookie for the given type. Assumes that an array cookie is + /// required. 
+ virtual CharUnits getArrayCookieSizeImpl(QualType elementType) = 0; }; /// Creates and Itanium-family ABI diff --git a/clang/lib/CIR/CodeGen/CIRGenCall.cpp b/clang/lib/CIR/CodeGen/CIRGenCall.cpp index 61072f0883728..50d4c035d30a1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCall.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCall.cpp @@ -671,9 +671,12 @@ RValue CIRGenFunction::emitCall(const CIRGenFunctionInfo &funcInfo, return RValue::get(results[0]); } - case cir::TEK_Complex: - cgm.errorNYI(loc, "unsupported evaluation kind of function call result"); - return getUndefRValue(retTy); + case cir::TEK_Complex: { + mlir::ResultRange results = theCall->getOpResults(); + assert(!results.empty() && + "Expected at least one result for complex rvalue"); + return RValue::getComplex(results[0]); + } } llvm_unreachable("Invalid evaluation kind"); } @@ -690,6 +693,22 @@ void CallArg::copyInto(CIRGenFunction &cgf, Address addr, isUsed = true; } +mlir::Value CIRGenFunction::emitRuntimeCall(mlir::Location loc, + cir::FuncOp callee, + ArrayRef args) { + // TODO(cir): set the calling convention to this runtime call. + assert(!cir::MissingFeatures::opFuncCallingConv()); + + cir::CallOp call = builder.createCallOp(loc, callee, args); + assert(call->getNumResults() <= 1 && + "runtime functions have at most 1 result"); + + if (call->getNumResults() == 0) + return nullptr; + + return call->getResult(0); +} + void CIRGenFunction::emitCallArg(CallArgList &args, const clang::Expr *e, clang::QualType argType) { assert(argType->isReferenceType() == e->isGLValue() && diff --git a/clang/lib/CIR/CodeGen/CIRGenClass.cpp b/clang/lib/CIR/CodeGen/CIRGenClass.cpp index 89f492603b11c..5046e0945002f 100644 --- a/clang/lib/CIR/CodeGen/CIRGenClass.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenClass.cpp @@ -725,8 +725,9 @@ void CIRGenFunction::emitCXXAggrConstructorCall( // Emit the constructor call that will execute for every array element. 
mlir::Value arrayOp = builder.createPtrBitcast(arrayBase.getPointer(), arrayTy); - builder.create( - *currSrcLoc, arrayOp, [&](mlir::OpBuilder &b, mlir::Location loc) { + cir::ArrayCtor::create( + builder, *currSrcLoc, arrayOp, + [&](mlir::OpBuilder &b, mlir::Location loc) { mlir::BlockArgument arg = b.getInsertionBlock()->addArgument(ptrToElmType, loc); Address curAddr = Address(arg, elementType, eltAlignment); @@ -738,7 +739,7 @@ void CIRGenFunction::emitCXXAggrConstructorCall( emitCXXConstructorCall(ctor, Ctor_Complete, /*ForVirtualBase=*/false, /*Delegating=*/false, currAVS, e); - builder.create(loc); + cir::YieldOp::create(builder, loc); }); } } diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp index aabe4bbdf18c8..851328a7db680 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.cpp @@ -28,6 +28,46 @@ using namespace clang::CIRGen; // CIRGenFunction cleanup related //===----------------------------------------------------------------------===// +/// Build a unconditional branch to the lexical scope cleanup block +/// or with the labeled blocked if already solved. +/// +/// Track on scope basis, goto's we need to fix later. +cir::BrOp CIRGenFunction::emitBranchThroughCleanup(mlir::Location loc, + JumpDest dest) { + // Insert a branch: to the cleanup block (unsolved) or to the already + // materialized label. Keep track of unsolved goto's. + assert(dest.getBlock() && "assumes incoming valid dest"); + auto brOp = cir::BrOp::create(builder, loc, dest.getBlock()); + + // Calculate the innermost active normal cleanup. + EHScopeStack::stable_iterator topCleanup = + ehStack.getInnermostActiveNormalCleanup(); + + // If we're not in an active normal cleanup scope, or if the + // destination scope is within the innermost active normal cleanup + // scope, we don't need to worry about fixups. 
+ if (topCleanup == ehStack.stable_end() || + topCleanup.encloses(dest.getScopeDepth())) { // works for invalid + // FIXME(cir): should we clear insertion point here? + return brOp; + } + + // If we can't resolve the destination cleanup scope, just add this + // to the current cleanup scope as a branch fixup. + if (!dest.getScopeDepth().isValid()) { + BranchFixup &fixup = ehStack.addBranchFixup(); + fixup.destination = dest.getBlock(); + fixup.destinationIndex = dest.getDestIndex(); + fixup.initialBranch = brOp; + fixup.optimisticBranchBlock = nullptr; + // FIXME(cir): should we clear insertion point here? + return brOp; + } + + cgm.errorNYI(loc, "emitBranchThroughCleanup: valid destination scope depth"); + return brOp; +} + /// Emits all the code to cause the given temporary to be cleaned up. void CIRGenFunction::emitCXXTemporary(const CXXTemporary *temporary, QualType tempType, Address ptr) { @@ -40,6 +80,19 @@ void CIRGenFunction::emitCXXTemporary(const CXXTemporary *temporary, void EHScopeStack::Cleanup::anchor() {} +EHScopeStack::stable_iterator +EHScopeStack::getInnermostActiveNormalCleanup() const { + stable_iterator si = getInnermostNormalCleanup(); + stable_iterator se = stable_end(); + while (si != se) { + EHCleanupScope &cleanup = llvm::cast(*find(si)); + if (cleanup.isActive()) + return si; + si = cleanup.getEnclosingNormalCleanup(); + } + return stable_end(); +} + /// Push an entry of the given size onto this protected-scope stack. char *EHScopeStack::allocate(size_t size) { size = llvm::alignTo(size, ScopeStackAlignment); @@ -75,14 +128,30 @@ void EHScopeStack::deallocate(size_t size) { startOfData += llvm::alignTo(size, ScopeStackAlignment); } +/// Remove any 'null' fixups on the stack. However, we can't pop more +/// fixups than the fixup depth on the innermost normal cleanup, or +/// else fixups that we try to add to that cleanup will end up in the +/// wrong place. 
We *could* try to shrink fixup depths, but that's +/// actually a lot of work for little benefit. +void EHScopeStack::popNullFixups() { + // We expect this to only be called when there's still an innermost + // normal cleanup; otherwise there really shouldn't be any fixups. + cgf->cgm.errorNYI("popNullFixups"); +} + void *EHScopeStack::pushCleanup(CleanupKind kind, size_t size) { char *buffer = allocate(EHCleanupScope::getSizeForCleanupSize(size)); + bool isNormalCleanup = kind & NormalCleanup; bool isEHCleanup = kind & EHCleanup; bool isLifetimeMarker = kind & LifetimeMarker; assert(!cir::MissingFeatures::innermostEHScope()); - EHCleanupScope *scope = new (buffer) EHCleanupScope(size); + EHCleanupScope *scope = new (buffer) + EHCleanupScope(size, branchFixups.size(), innermostNormalCleanup); + + if (isNormalCleanup) + innermostNormalCleanup = stable_begin(); if (isLifetimeMarker) cgf->cgm.errorNYI("push lifetime marker cleanup"); @@ -100,12 +169,23 @@ void EHScopeStack::popCleanup() { assert(isa(*begin())); EHCleanupScope &cleanup = cast(*begin()); + innermostNormalCleanup = cleanup.getEnclosingNormalCleanup(); deallocate(cleanup.getAllocatedSize()); // Destroy the cleanup. cleanup.destroy(); - assert(!cir::MissingFeatures::ehCleanupBranchFixups()); + // Check whether we can shrink the branch-fixups stack. + if (!branchFixups.empty()) { + // If we no longer have any normal cleanups, all the fixups are + // complete. + if (!hasNormalCleanups()) { + branchFixups.clear(); + } else { + // Otherwise we can still trim out unnecessary nulls. 
+ popNullFixups(); + } + } } EHCatchScope *EHScopeStack::pushCatch(unsigned numHandlers) { @@ -123,6 +203,18 @@ static void emitCleanup(CIRGenFunction &cgf, EHScopeStack::Cleanup *cleanup) { assert(cgf.haveInsertPoint() && "cleanup ended with no insertion point?"); } +static mlir::Block *createNormalEntry(CIRGenFunction &cgf, + EHCleanupScope &scope) { + assert(scope.isNormalCleanup()); + mlir::Block *entry = scope.getNormalBlock(); + if (!entry) { + mlir::OpBuilder::InsertionGuard guard(cgf.getBuilder()); + entry = cgf.curLexScope->getOrCreateCleanupBlock(cgf.getBuilder()); + scope.setNormalBlock(entry); + } + return entry; +} + /// Pops a cleanup block. If the block includes a normal cleanup, the /// current insertion point is threaded through the cleanup, as are /// any branch fixups on the cleanup. @@ -130,17 +222,21 @@ void CIRGenFunction::popCleanupBlock() { assert(!ehStack.empty() && "cleanup stack is empty!"); assert(isa(*ehStack.begin()) && "top not a cleanup!"); EHCleanupScope &scope = cast(*ehStack.begin()); + assert(scope.getFixupDepth() <= ehStack.getNumBranchFixups()); // Remember activation information. bool isActive = scope.isActive(); - assert(!cir::MissingFeatures::ehCleanupBranchFixups()); + // - whether there are branch fix-ups through this cleanup + unsigned fixupDepth = scope.getFixupDepth(); + bool hasFixups = ehStack.getNumBranchFixups() != fixupDepth; // - whether there's a fallthrough mlir::Block *fallthroughSource = builder.getInsertionBlock(); bool hasFallthrough = fallthroughSource != nullptr && isActive; - bool requiresNormalCleanup = scope.isNormalCleanup() && hasFallthrough; + bool requiresNormalCleanup = + scope.isNormalCleanup() && (hasFixups || hasFallthrough); // If we don't need the cleanup at all, we're done. 
assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); @@ -175,9 +271,119 @@ void CIRGenFunction::popCleanupBlock() { assert(!cir::MissingFeatures::ehCleanupFlags()); - ehStack.popCleanup(); - scope.markEmitted(); - emitCleanup(*this, cleanup); + // If we have a fallthrough and no other need for the cleanup, + // emit it directly. + if (hasFallthrough && !hasFixups) { + assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); + ehStack.popCleanup(); + scope.markEmitted(); + emitCleanup(*this, cleanup); + } else { + // Otherwise, the best approach is to thread everything through + // the cleanup block and then try to clean up after ourselves. + + // Force the entry block to exist. + mlir::Block *normalEntry = createNormalEntry(*this, scope); + + // I. Set up the fallthrough edge in. + mlir::OpBuilder::InsertPoint savedInactiveFallthroughIP; + + // If there's a fallthrough, we need to store the cleanup + // destination index. For fall-throughs this is always zero. + if (hasFallthrough) { + assert(!cir::MissingFeatures::ehCleanupHasPrebranchedFallthrough()); + + } else if (fallthroughSource) { + // Otherwise, save and clear the IP if we don't have fallthrough + // because the cleanup is inactive. + assert(!isActive && "source without fallthrough for active cleanup"); + savedInactiveFallthroughIP = builder.saveInsertionPoint(); + } + + // II. Emit the entry block. This implicitly branches to it if + // we have fallthrough. All the fixups and existing branches + // should already be branched to it. + builder.setInsertionPointToEnd(normalEntry); + + // intercept normal cleanup to mark SEH scope end + assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); + + // III. Figure out where we're going and build the cleanup + // epilogue. 
+ bool hasEnclosingCleanups = + (scope.getEnclosingNormalCleanup() != ehStack.stable_end()); + + // Compute the branch-through dest if we need it: + // - if there are branch-throughs threaded through the scope + // - if fall-through is a branch-through + // - if there are fixups that will be optimistically forwarded + // to the enclosing cleanup + assert(!cir::MissingFeatures::cleanupBranchThrough()); + if (hasFixups && hasEnclosingCleanups) + cgm.errorNYI("cleanup branch-through dest"); + + mlir::Block *fallthroughDest = nullptr; + + // If there's exactly one branch-after and no other threads, + // we can route it without a switch. + // Skip for SEH, since ExitSwitch is used to generate code to indicate + // abnormal termination. (SEH: Except _leave and fall-through at + // the end, all other exits in a _try (return/goto/continue/break) + // are considered as abnormal terminations, using NormalCleanupDestSlot + // to indicate abnormal termination) + assert(!cir::MissingFeatures::cleanupBranchThrough()); + assert(!cir::MissingFeatures::ehCleanupScopeRequiresEHCleanup()); + + // IV. Pop the cleanup and emit it. + scope.markEmitted(); + ehStack.popCleanup(); + assert(ehStack.hasNormalCleanups() == hasEnclosingCleanups); + + emitCleanup(*this, cleanup); + + // Append the prepared cleanup prologue from above. + assert(!cir::MissingFeatures::cleanupAppendInsts()); + + // Optimistically hope that any fixups will continue falling through. + if (fixupDepth != ehStack.getNumBranchFixups()) + cgm.errorNYI("cleanup fixup depth mismatch"); + + // V. Set up the fallthrough edge out. + + // Case 1: a fallthrough source exists but doesn't branch to the + // cleanup because the cleanup is inactive. + if (!hasFallthrough && fallthroughSource) { + // Prebranched fallthrough was forwarded earlier. + // Non-prebranched fallthrough doesn't need to be forwarded. + // Either way, all we need to do is restore the IP we cleared before. 
+ assert(!isActive); + cgm.errorNYI("cleanup inactive fallthrough"); + + // Case 2: a fallthrough source exists and should branch to the + // cleanup, but we're not supposed to branch through to the next + // cleanup. + } else if (hasFallthrough && fallthroughDest) { + cgm.errorNYI("cleanup fallthrough destination"); + + // Case 3: a fallthrough source exists and should branch to the + // cleanup and then through to the next. + } else if (hasFallthrough) { + // Everything is already set up for this. + + // Case 4: no fallthrough source exists. + } else { + // FIXME(cir): should we clear insertion point here? + } + + // VI. Assorted cleaning. + + // Check whether we can merge NormalEntry into a single predecessor. + // This might invalidate (non-IR) pointers to NormalEntry. + // + // If it did invalidate those pointers, and normalEntry was the same + // as NormalExit, go back and patch up the fixups. + assert(!cir::MissingFeatures::simplifyCleanupEntry()); + } } /// Pops cleanup blocks until the given savepoint is reached. diff --git a/clang/lib/CIR/CodeGen/CIRGenCleanup.h b/clang/lib/CIR/CodeGen/CIRGenCleanup.h index 5852ea3f43dd5..61a09a59b05c0 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCleanup.h +++ b/clang/lib/CIR/CodeGen/CIRGenCleanup.h @@ -15,6 +15,7 @@ #define CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H #include "Address.h" +#include "CIRGenModule.h" #include "EHScopeStack.h" #include "mlir/IR/Value.h" @@ -151,6 +152,18 @@ class EHCatchScope : public EHScope { /// A cleanup scope which generates the cleanup blocks lazily. class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope : public EHScope { + /// The nearest normal cleanup scope enclosing this one. + EHScopeStack::stable_iterator enclosingNormal; + + /// The dual entry/exit block along the normal edge. This is lazily + /// created if needed before the cleanup is popped. + mlir::Block *normalBlock = nullptr; + + /// The number of fixups required by enclosing scopes (not including + /// this one). 
If this is the top cleanup scope, all the fixups + /// from this index onwards belong to this scope. + unsigned fixupDepth = 0; + public: /// Gets the size required for a lazy cleanup scope with the given /// cleanup-data requirements. @@ -162,7 +175,10 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope return sizeof(EHCleanupScope) + cleanupBits.cleanupSize; } - EHCleanupScope(unsigned cleanupSize) : EHScope(EHScope::Cleanup) { + EHCleanupScope(unsigned cleanupSize, unsigned fixupDepth, + EHScopeStack::stable_iterator enclosingNormal) + : EHScope(EHScope::Cleanup), enclosingNormal(enclosingNormal), + fixupDepth(fixupDepth) { // TODO(cir): When exception handling is upstreamed, isNormalCleanup and // isEHCleanup will be arguments to the constructor. cleanupBits.isNormalCleanup = true; @@ -180,11 +196,19 @@ class alignas(EHScopeStack::ScopeStackAlignment) EHCleanupScope // Objects of EHCleanupScope are not destructed. Use destroy(). ~EHCleanupScope() = delete; + mlir::Block *getNormalBlock() const { return normalBlock; } + void setNormalBlock(mlir::Block *bb) { normalBlock = bb; } + bool isNormalCleanup() const { return cleanupBits.isNormalCleanup; } bool isActive() const { return cleanupBits.isActive; } void setActive(bool isActive) { cleanupBits.isActive = isActive; } + unsigned getFixupDepth() const { return fixupDepth; } + EHScopeStack::stable_iterator getEnclosingNormalCleanup() const { + return enclosingNormal; + } + size_t getCleanupSize() const { return cleanupBits.cleanupSize; } void *getCleanupBuffer() { return this + 1; } @@ -234,5 +258,53 @@ inline void EHScopeStack::popCatch() { deallocate(EHCatchScope::getSizeForNumHandlers(scope.getNumHandlers())); } +/// The exceptions personality for a function. +struct EHPersonality { + const char *personalityFn = nullptr; + + // If this is non-null, this personality requires a non-standard + // function for rethrowing an exception after a catchall cleanup. 
+ // This function must have prototype void(void*). + const char *catchallRethrowFn = nullptr; + + static const EHPersonality &get(CIRGenModule &cgm, + const clang::FunctionDecl *fd); + static const EHPersonality &get(CIRGenFunction &cgf); + + static const EHPersonality GNU_C; + static const EHPersonality GNU_C_SJLJ; + static const EHPersonality GNU_C_SEH; + static const EHPersonality GNU_ObjC; + static const EHPersonality GNU_ObjC_SJLJ; + static const EHPersonality GNU_ObjC_SEH; + static const EHPersonality GNUstep_ObjC; + static const EHPersonality GNU_ObjCXX; + static const EHPersonality NeXT_ObjC; + static const EHPersonality GNU_CPlusPlus; + static const EHPersonality GNU_CPlusPlus_SJLJ; + static const EHPersonality GNU_CPlusPlus_SEH; + static const EHPersonality MSVC_except_handler; + static const EHPersonality MSVC_C_specific_handler; + static const EHPersonality MSVC_CxxFrameHandler3; + static const EHPersonality GNU_Wasm_CPlusPlus; + static const EHPersonality XL_CPlusPlus; + static const EHPersonality ZOS_CPlusPlus; + + /// Does this personality use landingpads or the family of pad instructions + /// designed to form funclets? 
+ bool usesFuncletPads() const { + return isMSVCPersonality() || isWasmPersonality(); + } + + bool isMSVCPersonality() const { + return this == &MSVC_except_handler || this == &MSVC_C_specific_handler || + this == &MSVC_CxxFrameHandler3; + } + + bool isWasmPersonality() const { return this == &GNU_Wasm_CPlusPlus; } + + bool isMSVCXXPersonality() const { return this == &MSVC_CxxFrameHandler3; } +}; + } // namespace clang::CIRGen #endif // CLANG_LIB_CIR_CODEGEN_CIRGENCLEANUP_H diff --git a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp index c25cce4ab33b3..8723a6e502b38 100644 --- a/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenCoroutine.cpp @@ -15,6 +15,7 @@ #include "clang/AST/StmtCXX.h" #include "clang/Basic/TargetInfo.h" #include "clang/CIR/Dialect/IR/CIRTypes.h" +#include "clang/CIR/MissingFeatures.h" using namespace clang; using namespace clang::CIRGen; @@ -23,6 +24,9 @@ struct clang::CIRGen::CGCoroData { // Stores the __builtin_coro_id emitted in the function so that we can supply // it as the first argument to other builtins. cir::CallOp coroId = nullptr; + + // Stores the result of __builtin_coro_begin call. + mlir::Value coroBegin = nullptr; }; // Defining these here allows to keep CGCoroData private to this file. 
@@ -63,6 +67,46 @@ cir::CallOp CIRGenFunction::emitCoroIDBuiltinCall(mlir::Location loc, nullPtr, nullPtr, nullPtr}); } +cir::CallOp CIRGenFunction::emitCoroAllocBuiltinCall(mlir::Location loc) { + cir::BoolType boolTy = builder.getBoolTy(); + + mlir::Operation *builtin = cgm.getGlobalValue(cgm.builtinCoroAlloc); + + cir::FuncOp fnOp; + if (!builtin) { + fnOp = cgm.createCIRBuiltinFunction(loc, cgm.builtinCoroAlloc, + cir::FuncType::get({UInt32Ty}, boolTy), + /*fd=*/nullptr); + assert(fnOp && "should always succeed"); + } else { + fnOp = cast(builtin); + } + + return builder.createCallOp( + loc, fnOp, mlir::ValueRange{curCoro.data->coroId.getResult()}); +} + +cir::CallOp +CIRGenFunction::emitCoroBeginBuiltinCall(mlir::Location loc, + mlir::Value coroframeAddr) { + mlir::Operation *builtin = cgm.getGlobalValue(cgm.builtinCoroBegin); + + cir::FuncOp fnOp; + if (!builtin) { + fnOp = cgm.createCIRBuiltinFunction( + loc, cgm.builtinCoroBegin, + cir::FuncType::get({UInt32Ty, VoidPtrTy}, VoidPtrTy), + /*fd=*/nullptr); + assert(fnOp && "should always succeed"); + } else { + fnOp = cast(builtin); + } + + return builder.createCallOp( + loc, fnOp, + mlir::ValueRange{curCoro.data->coroId.getResult(), coroframeAddr}); +} + mlir::LogicalResult CIRGenFunction::emitCoroutineBody(const CoroutineBodyStmt &s) { mlir::Location openCurlyLoc = getLoc(s.getBeginLoc()); @@ -73,10 +117,39 @@ CIRGenFunction::emitCoroutineBody(const CoroutineBodyStmt &s) { cir::CallOp coroId = emitCoroIDBuiltinCall(openCurlyLoc, nullPtrCst); createCoroData(*this, curCoro, coroId); - assert(!cir::MissingFeatures::coroAllocBuiltinCall()); - - assert(!cir::MissingFeatures::coroBeginBuiltinCall()); + // Backend is allowed to elide memory allocations, to help it, emit + // auto mem = coro.alloc() ? 0 : ... 
allocation code ...; + cir::CallOp coroAlloc = emitCoroAllocBuiltinCall(openCurlyLoc); + + // Initialize address of coroutine frame to null + CanQualType astVoidPtrTy = cgm.getASTContext().VoidPtrTy; + mlir::Type allocaTy = convertTypeForMem(astVoidPtrTy); + Address coroFrame = + createTempAlloca(allocaTy, getContext().getTypeAlignInChars(astVoidPtrTy), + openCurlyLoc, "__coro_frame_addr", + /*ArraySize=*/nullptr); + + mlir::Value storeAddr = coroFrame.getPointer(); + builder.CIRBaseBuilderTy::createStore(openCurlyLoc, nullPtrCst, storeAddr); + cir::IfOp::create( + builder, openCurlyLoc, coroAlloc.getResult(), + /*withElseRegion=*/false, + /*thenBuilder=*/[&](mlir::OpBuilder &b, mlir::Location loc) { + builder.CIRBaseBuilderTy::createStore( + loc, emitScalarExpr(s.getAllocate()), storeAddr); + cir::YieldOp::create(builder, loc); + }); + curCoro.data->coroBegin = + emitCoroBeginBuiltinCall( + openCurlyLoc, + cir::LoadOp::create(builder, openCurlyLoc, allocaTy, storeAddr)) + .getResult(); + + // Handle allocation failure if 'ReturnStmtOnAllocFailure' was provided. 
+ if (s.getReturnStmtOnAllocFailure()) + cgm.errorNYI("handle coroutine return alloc failure"); assert(!cir::MissingFeatures::generateDebugInfo()); + assert(!cir::MissingFeatures::emitBodyAndFallthrough()); return mlir::success(); } diff --git a/clang/lib/CIR/CodeGen/CIRGenException.cpp b/clang/lib/CIR/CodeGen/CIRGenException.cpp index 717a3e0032cea..67f46ffde8fda 100644 --- a/clang/lib/CIR/CodeGen/CIRGenException.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenException.cpp @@ -18,6 +18,171 @@ using namespace clang; using namespace clang::CIRGen; +const EHPersonality EHPersonality::GNU_C = {"__gcc_personality_v0", nullptr}; +const EHPersonality EHPersonality::GNU_C_SJLJ = {"__gcc_personality_sj0", + nullptr}; +const EHPersonality EHPersonality::GNU_C_SEH = {"__gcc_personality_seh0", + nullptr}; +const EHPersonality EHPersonality::NeXT_ObjC = {"__objc_personality_v0", + nullptr}; +const EHPersonality EHPersonality::GNU_CPlusPlus = {"__gxx_personality_v0", + nullptr}; +const EHPersonality EHPersonality::GNU_CPlusPlus_SJLJ = { + "__gxx_personality_sj0", nullptr}; +const EHPersonality EHPersonality::GNU_CPlusPlus_SEH = { + "__gxx_personality_seh0", nullptr}; +const EHPersonality EHPersonality::GNU_ObjC = {"__gnu_objc_personality_v0", + "objc_exception_throw"}; +const EHPersonality EHPersonality::GNU_ObjC_SJLJ = { + "__gnu_objc_personality_sj0", "objc_exception_throw"}; +const EHPersonality EHPersonality::GNU_ObjC_SEH = { + "__gnu_objc_personality_seh0", "objc_exception_throw"}; +const EHPersonality EHPersonality::GNU_ObjCXX = { + "__gnustep_objcxx_personality_v0", nullptr}; +const EHPersonality EHPersonality::GNUstep_ObjC = { + "__gnustep_objc_personality_v0", nullptr}; +const EHPersonality EHPersonality::MSVC_except_handler = {"_except_handler3", + nullptr}; +const EHPersonality EHPersonality::MSVC_C_specific_handler = { + "__C_specific_handler", nullptr}; +const EHPersonality EHPersonality::MSVC_CxxFrameHandler3 = { + "__CxxFrameHandler3", nullptr}; +const EHPersonality 
EHPersonality::GNU_Wasm_CPlusPlus = { + "__gxx_wasm_personality_v0", nullptr}; +const EHPersonality EHPersonality::XL_CPlusPlus = {"__xlcxx_personality_v1", + nullptr}; +const EHPersonality EHPersonality::ZOS_CPlusPlus = {"__zos_cxx_personality_v2", + nullptr}; + +static const EHPersonality &getCPersonality(const TargetInfo &target, + const CodeGenOptions &cgOpts) { + const llvm::Triple &triple = target.getTriple(); + if (triple.isWindowsMSVCEnvironment()) + return EHPersonality::MSVC_CxxFrameHandler3; + if (cgOpts.hasSjLjExceptions()) + return EHPersonality::GNU_C_SJLJ; + if (cgOpts.hasDWARFExceptions()) + return EHPersonality::GNU_C; + if (cgOpts.hasSEHExceptions()) + return EHPersonality::GNU_C_SEH; + return EHPersonality::GNU_C; +} + +static const EHPersonality &getObjCPersonality(const TargetInfo &target, + const LangOptions &langOpts, + const CodeGenOptions &cgOpts) { + const llvm::Triple &triple = target.getTriple(); + if (triple.isWindowsMSVCEnvironment()) + return EHPersonality::MSVC_CxxFrameHandler3; + + switch (langOpts.ObjCRuntime.getKind()) { + case ObjCRuntime::FragileMacOSX: + return getCPersonality(target, cgOpts); + case ObjCRuntime::MacOSX: + case ObjCRuntime::iOS: + case ObjCRuntime::WatchOS: + return EHPersonality::NeXT_ObjC; + case ObjCRuntime::GNUstep: + if (langOpts.ObjCRuntime.getVersion() >= VersionTuple(1, 7)) + return EHPersonality::GNUstep_ObjC; + [[fallthrough]]; + case ObjCRuntime::GCC: + case ObjCRuntime::ObjFW: + if (cgOpts.hasSjLjExceptions()) + return EHPersonality::GNU_ObjC_SJLJ; + if (cgOpts.hasSEHExceptions()) + return EHPersonality::GNU_ObjC_SEH; + return EHPersonality::GNU_ObjC; + } + llvm_unreachable("bad runtime kind"); +} + +static const EHPersonality &getCXXPersonality(const TargetInfo &target, + const CodeGenOptions &cgOpts) { + const llvm::Triple &triple = target.getTriple(); + if (triple.isWindowsMSVCEnvironment()) + return EHPersonality::MSVC_CxxFrameHandler3; + if (triple.isOSAIX()) + return 
EHPersonality::XL_CPlusPlus; + if (cgOpts.hasSjLjExceptions()) + return EHPersonality::GNU_CPlusPlus_SJLJ; + if (cgOpts.hasDWARFExceptions()) + return EHPersonality::GNU_CPlusPlus; + if (cgOpts.hasSEHExceptions()) + return EHPersonality::GNU_CPlusPlus_SEH; + if (cgOpts.hasWasmExceptions()) + return EHPersonality::GNU_Wasm_CPlusPlus; + return EHPersonality::GNU_CPlusPlus; +} + +/// Determines the personality function to use when both C++ +/// and Objective-C exceptions are being caught. +static const EHPersonality &getObjCXXPersonality(const TargetInfo &target, + const LangOptions &langOpts, + const CodeGenOptions &cgOpts) { + if (target.getTriple().isWindowsMSVCEnvironment()) + return EHPersonality::MSVC_CxxFrameHandler3; + + switch (langOpts.ObjCRuntime.getKind()) { + // In the fragile ABI, just use C++ exception handling and hope + // they're not doing crazy exception mixing. + case ObjCRuntime::FragileMacOSX: + return getCXXPersonality(target, cgOpts); + + // The ObjC personality defers to the C++ personality for non-ObjC + // handlers. Unlike the C++ case, we use the same personality + // function on targets using (backend-driven) SJLJ EH. + case ObjCRuntime::MacOSX: + case ObjCRuntime::iOS: + case ObjCRuntime::WatchOS: + return getObjCPersonality(target, langOpts, cgOpts); + + case ObjCRuntime::GNUstep: + return EHPersonality::GNU_ObjCXX; + + // The GCC runtime's personality function inherently doesn't support + // mixed EH. Use the ObjC personality just to avoid returning null. + case ObjCRuntime::GCC: + case ObjCRuntime::ObjFW: + return getObjCPersonality(target, langOpts, cgOpts); + } + llvm_unreachable("bad runtime kind"); +} + +static const EHPersonality &getSEHPersonalityMSVC(const llvm::Triple &triple) { + return triple.getArch() == llvm::Triple::x86 + ? 
EHPersonality::MSVC_except_handler + : EHPersonality::MSVC_C_specific_handler; +} + +const EHPersonality &EHPersonality::get(CIRGenModule &cgm, + const FunctionDecl *fd) { + const llvm::Triple &triple = cgm.getTarget().getTriple(); + const LangOptions &langOpts = cgm.getLangOpts(); + const CodeGenOptions &cgOpts = cgm.getCodeGenOpts(); + const TargetInfo &target = cgm.getTarget(); + + // Functions using SEH get an SEH personality. + if (fd && fd->usesSEHTry()) + return getSEHPersonalityMSVC(triple); + + if (langOpts.ObjC) { + return langOpts.CPlusPlus ? getObjCXXPersonality(target, langOpts, cgOpts) + : getObjCPersonality(target, langOpts, cgOpts); + } + return langOpts.CPlusPlus ? getCXXPersonality(target, cgOpts) + : getCPersonality(target, cgOpts); +} + +const EHPersonality &EHPersonality::get(CIRGenFunction &cgf) { + const auto *fg = cgf.curCodeDecl; + // For outlined finallys and filters, use the SEH personality in case they + // contain more SEH. This mostly only affects finallys. Filters could + // hypothetically use gnu statement expressions to sneak in nested SEH. + fg = fg ? 
fg : cgf.curSEHParent.getDecl(); + return get(cgf.cgm, dyn_cast_or_null(fg)); +} + void CIRGenFunction::emitCXXThrowExpr(const CXXThrowExpr *e) { const llvm::Triple &triple = getTarget().getTriple(); if (cgm.getLangOpts().OpenMPIsTargetDevice && diff --git a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp index 52021fce1c675..df6ee56eac30b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExpr.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExpr.cpp @@ -251,8 +251,8 @@ void CIRGenFunction::emitStoreThroughLValue(RValue src, LValue dst, const mlir::Location loc = dst.getVectorPointer().getLoc(); const mlir::Value vector = builder.createLoad(loc, dst.getVectorAddress()); - const mlir::Value newVector = builder.create( - loc, vector, src.getValue(), dst.getVectorIdx()); + const mlir::Value newVector = cir::VecInsertOp::create( + builder, loc, vector, src.getValue(), dst.getVectorIdx()); builder.createStore(loc, newVector, dst.getVectorAddress()); return; } @@ -615,8 +615,8 @@ RValue CIRGenFunction::emitLoadOfLValue(LValue lv, SourceLocation loc) { if (lv.isVectorElt()) { const mlir::Value load = builder.createLoad(getLoc(loc), lv.getVectorAddress()); - return RValue::get(builder.create(getLoc(loc), load, - lv.getVectorIdx())); + return RValue::get(cir::VecExtractOp::create(builder, getLoc(loc), load, + lv.getVectorIdx())); } cgm.errorNYI(loc, "emitLoadOfLValue"); @@ -671,8 +671,8 @@ static LValue emitFunctionDeclLValue(CIRGenFunction &cgf, const Expr *e, mlir::Type fnTy = funcOp.getFunctionType(); mlir::Type ptrTy = cir::PointerType::get(fnTy); - mlir::Value addr = cgf.getBuilder().create( - loc, ptrTy, funcOp.getSymName()); + mlir::Value addr = cir::GetGlobalOp::create(cgf.getBuilder(), loc, ptrTy, + funcOp.getSymName()); if (funcOp.getFunctionType() != cgf.convertType(fd->getType())) { fnTy = cgf.convertType(fd->getType()); @@ -1685,8 +1685,8 @@ CIRGenCallee CIRGenFunction::emitDirectCallee(const GlobalDecl &gd) { mlir::OpBuilder::InsertionGuard 
guard(builder); builder.setInsertionPointToStart(cgm.getModule().getBody()); - clone = builder.create(calleeFunc.getLoc(), fdInlineName, - calleeFunc.getFunctionType()); + clone = cir::FuncOp::create(builder, calleeFunc.getLoc(), fdInlineName, + calleeFunc.getFunctionType()); clone.setLinkageAttr(cir::GlobalLinkageKindAttr::get( &cgm.getMLIRContext(), cir::GlobalLinkageKind::InternalLinkage)); clone.setSymVisibility("private"); @@ -1778,8 +1778,8 @@ RValue CIRGenFunction::emitCall(clang::QualType calleeTy, mlir::Operation *fn = callee.getFunctionPointer(); mlir::Value addr; if (auto funcOp = mlir::dyn_cast(fn)) { - addr = builder.create( - getLoc(e->getSourceRange()), + addr = cir::GetGlobalOp::create( + builder, getLoc(e->getSourceRange()), cir::PointerType::get(funcOp.getFunctionType()), funcOp.getSymName()); } else { addr = fn->getResult(0); @@ -1820,10 +1820,12 @@ CIRGenCallee CIRGenFunction::emitCallee(const clang::Expr *e) { // Resolve direct calls. const auto *funcDecl = cast(declRef->getDecl()); return emitDirectCallee(funcDecl); - } else if (isa(e)) { - cgm.errorNYI(e->getSourceRange(), - "emitCallee: call to member function is NYI"); - return {}; + } else if (auto me = dyn_cast(e)) { + if (const auto *fd = dyn_cast(me->getMemberDecl())) { + emitIgnoredExpr(me->getBase()); + return emitDirectCallee(fd); + } + // Else fall through to the indirect reference handling below. } else if (auto *pde = dyn_cast(e)) { return CIRGenCallee::forPseudoDestructor(pde); } @@ -1996,9 +1998,9 @@ cir::IfOp CIRGenFunction::emitIfOnBoolExpr( // Emit the code with the fully general case. mlir::Value condV = emitOpOnBoolExpr(loc, cond); - return builder.create(loc, condV, elseLoc.has_value(), - /*thenBuilder=*/thenBuilder, - /*elseBuilder=*/elseBuilder); + return cir::IfOp::create(builder, loc, condV, elseLoc.has_value(), + /*thenBuilder=*/thenBuilder, + /*elseBuilder=*/elseBuilder); } /// TODO(cir): see EmitBranchOnBoolExpr for extra ideas). 
@@ -2020,18 +2022,17 @@ mlir::Value CIRGenFunction::emitOpOnBoolExpr(mlir::Location loc, mlir::Value condV = emitOpOnBoolExpr(loc, condOp->getCond()); mlir::Value ternaryOpRes = - builder - .create( - loc, condV, /*thenBuilder=*/ - [this, trueExpr](mlir::OpBuilder &b, mlir::Location loc) { - mlir::Value lhs = emitScalarExpr(trueExpr); - b.create(loc, lhs); - }, - /*elseBuilder=*/ - [this, falseExpr](mlir::OpBuilder &b, mlir::Location loc) { - mlir::Value rhs = emitScalarExpr(falseExpr); - b.create(loc, rhs); - }) + cir::TernaryOp::create( + builder, loc, condV, /*thenBuilder=*/ + [this, trueExpr](mlir::OpBuilder &b, mlir::Location loc) { + mlir::Value lhs = emitScalarExpr(trueExpr); + cir::YieldOp::create(b, loc, lhs); + }, + /*elseBuilder=*/ + [this, falseExpr](mlir::OpBuilder &b, mlir::Location loc) { + mlir::Value rhs = emitScalarExpr(falseExpr); + cir::YieldOp::create(b, loc, rhs); + }) .getResult(); return emitScalarConversion(ternaryOpRes, condOp->getType(), @@ -2064,7 +2065,11 @@ mlir::Value CIRGenFunction::emitAlloca(StringRef name, mlir::Type ty, // a surrounding cir.scope, make sure the alloca ends up in the surrounding // scope instead. This is necessary in order to guarantee all SSA values are // reachable during cleanups. 
- assert(!cir::MissingFeatures::tryOp()); + if (auto tryOp = + llvm::dyn_cast_if_present(entryBlock->getParentOp())) { + if (auto scopeOp = llvm::dyn_cast(tryOp->getParentOp())) + entryBlock = &scopeOp.getScopeRegion().front(); + } return emitAlloca(name, ty, loc, alignment, builder.getBestAllocaInsertPoint(entryBlock), arraySize); @@ -2211,8 +2216,8 @@ Address CIRGenFunction::emitLoadOfReference(LValue refLVal, mlir::Location loc, cgm.errorNYI(loc, "load of volatile reference"); cir::LoadOp load = - builder.create(loc, refLVal.getAddress().getElementType(), - refLVal.getAddress().getPointer()); + cir::LoadOp::create(builder, loc, refLVal.getAddress().getElementType(), + refLVal.getAddress().getPointer()); assert(!cir::MissingFeatures::opTBAA()); diff --git a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp index 568cbdb06bb48..8fe0d9b4a69ef 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprAggregate.cpp @@ -280,6 +280,7 @@ class AggExprEmitter : public StmtVisitor { void VisitUnaryDeref(UnaryOperator *e) { emitAggLoadOfLValue(e); } void VisitStringLiteral(StringLiteral *e) { emitAggLoadOfLValue(e); } void VisitCompoundLiteralExpr(CompoundLiteralExpr *e); + void VisitPredefinedExpr(const PredefinedExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitPredefinedExpr"); @@ -361,8 +362,7 @@ class AggExprEmitter : public StmtVisitor { cgf.cgm.errorNYI(e->getSourceRange(), "AggExprEmitter: VisitCXXTypeidExpr"); } void VisitMaterializeTemporaryExpr(MaterializeTemporaryExpr *e) { - cgf.cgm.errorNYI(e->getSourceRange(), - "AggExprEmitter: VisitMaterializeTemporaryExpr"); + Visit(e->getSubExpr()); } void VisitOpaqueValueExpr(OpaqueValueExpr *e) { cgf.cgm.errorNYI(e->getSourceRange(), @@ -670,7 +670,7 @@ void AggExprEmitter::emitNullInitializationToLValue(mlir::Location loc, return; } - cgf.cgm.errorNYI("emitStoreThroughBitfieldLValue"); + 
cgf.emitStoreThroughBitfieldLValue(RValue::get(null), lv); return; } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp index b1e9e768ff1e2..7a35382e79a93 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprCXX.cpp @@ -306,6 +306,7 @@ static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, mlir::cast(constNumElements).getValue(); unsigned numElementsWidth = count.getBitWidth(); + bool hasAnyOverflow = false; // The equivalent code in CodeGen/CGExprCXX.cpp handles these cases as // overflow, but that should never happen. The size argument is implicitly @@ -336,11 +337,22 @@ static mlir::Value emitCXXNewAllocSize(CIRGenFunction &cgf, const CXXNewExpr *e, // Add in the cookie, and check whether it's overflowed. if (cookieSize != 0) { - cgf.cgm.errorNYI(e->getSourceRange(), - "emitCXXNewAllocSize: array cookie"); + // Save the current size without a cookie. This shouldn't be + // used if there was overflow + sizeWithoutCookie = cgf.getBuilder().getConstInt( + loc, allocationSize.zextOrTrunc(sizeWidth)); + + allocationSize = allocationSize.uadd_ov(cookieSize, overflow); + hasAnyOverflow |= overflow; } - size = cgf.getBuilder().getConstInt(loc, allocationSize); + // On overflow, produce a -1 so operator new will fail + if (hasAnyOverflow) { + size = + cgf.getBuilder().getConstInt(loc, llvm::APInt::getAllOnes(sizeWidth)); + } else { + size = cgf.getBuilder().getConstInt(loc, allocationSize); + } } else { // TODO: Handle the variable size case cgf.cgm.errorNYI(e->getSourceRange(), @@ -390,7 +402,50 @@ void CIRGenFunction::emitNewArrayInitializer( if (!e->hasInitializer()) return; - cgm.errorNYI(e->getSourceRange(), "emitNewArrayInitializer"); + unsigned initListElements = 0; + + const Expr *init = e->getInitializer(); + const InitListExpr *ile = dyn_cast(init); + if (ile) { + cgm.errorNYI(ile->getSourceRange(), "emitNewArrayInitializer: init list"); + return; + } + + 
// If all elements have already been initialized, skip any further + // initialization. + auto constOp = mlir::dyn_cast(numElements.getDefiningOp()); + if (constOp) { + auto constIntAttr = mlir::dyn_cast(constOp.getValue()); + // Just skip out if the constant count is zero. + if (constIntAttr && constIntAttr.getUInt() <= initListElements) + return; + } + + assert(init && "have trailing elements to initialize but no initializer"); + + // If this is a constructor call, try to optimize it out, and failing that + // emit a single loop to initialize all remaining elements. + if (const CXXConstructExpr *cce = dyn_cast(init)) { + CXXConstructorDecl *ctor = cce->getConstructor(); + if (ctor->isTrivial()) { + // If new expression did not specify value-initialization, then there + // is no initialization. + if (!cce->requiresZeroInitialization()) + return; + + cgm.errorNYI(cce->getSourceRange(), + "emitNewArrayInitializer: trivial ctor zero-init"); + return; + } + + cgm.errorNYI(cce->getSourceRange(), + "emitNewArrayInitializer: ctor initializer"); + return; + } + + cgm.errorNYI(init->getSourceRange(), + "emitNewArrayInitializer: unsupported initializer"); + return; } static void emitNewInitializer(CIRGenFunction &cgf, const CXXNewExpr *e, @@ -510,8 +565,10 @@ static void emitObjectDelete(CIRGenFunction &cgf, const CXXDeleteExpr *de, dtor = rd->getDestructor(); if (dtor->isVirtual()) { - cgf.cgm.errorNYI(de->getSourceRange(), - "emitObjectDelete: virtual destructor"); + assert(!cir::MissingFeatures::devirtualizeDestructor()); + cgf.cgm.getCXXABI().emitVirtualObjectDelete(cgf, de, ptr, elementType, + dtor); + return; } } } @@ -586,9 +643,6 @@ mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) { // If there is a brace-initializer, cannot allocate fewer elements than inits. 
unsigned minElements = 0; - if (e->isArray() && e->hasInitializer()) { - cgm.errorNYI(e->getSourceRange(), "emitCXXNewExpr: array initializer"); - } mlir::Value numElements = nullptr; mlir::Value allocSizeWithoutCookie = nullptr; @@ -667,8 +721,11 @@ mlir::Value CIRGenFunction::emitCXXNewExpr(const CXXNewExpr *e) { !e->getOperatorDelete()->isReservedGlobalPlacementOperator()) cgm.errorNYI(e->getSourceRange(), "emitCXXNewExpr: operator delete"); - if (allocSize != allocSizeWithoutCookie) - cgm.errorNYI(e->getSourceRange(), "emitCXXNewExpr: array with cookies"); + if (allocSize != allocSizeWithoutCookie) { + assert(e->isArray()); + allocation = cgm.getCXXABI().initializeArrayCookie( + *this, allocation, numElements, e, allocType); + } mlir::Type elementTy; if (e->isArray()) { @@ -746,6 +803,26 @@ void CIRGenFunction::emitDeleteCall(const FunctionDecl *deleteFD, emitNewDeleteCall(*this, deleteFD, deleteFTy, deleteArgs); } +static mlir::Value emitDynamicCastToNull(CIRGenFunction &cgf, + mlir::Location loc, QualType destTy) { + mlir::Type destCIRTy = cgf.convertType(destTy); + assert(mlir::isa(destCIRTy) && + "result of dynamic_cast should be a ptr"); + + if (!destTy->isPointerType()) { + mlir::Region *currentRegion = cgf.getBuilder().getBlock()->getParent(); + /// C++ [expr.dynamic.cast]p9: + /// A failed cast to reference type throws std::bad_cast + cgf.cgm.getCXXABI().emitBadCastCall(cgf, loc); + + // The call to bad_cast will terminate the current block. Create a new block + // to hold any follow up code. 
+ cgf.getBuilder().createBlock(currentRegion, currentRegion->end()); + } + + return cgf.getBuilder().getNullPtr(destCIRTy, loc); +} + mlir::Value CIRGenFunction::emitDynamicCast(Address thisAddr, const CXXDynamicCastExpr *dce) { mlir::Location loc = getLoc(dce->getSourceRange()); @@ -776,10 +853,8 @@ mlir::Value CIRGenFunction::emitDynamicCast(Address thisAddr, assert(srcRecordTy->isRecordType() && "source type must be a record type!"); assert(!cir::MissingFeatures::emitTypeCheck()); - if (dce->isAlwaysNull()) { - cgm.errorNYI(dce->getSourceRange(), "emitDynamicCastToNull"); - return {}; - } + if (dce->isAlwaysNull()) + return emitDynamicCastToNull(*this, loc, destTy); auto destCirTy = mlir::cast(convertType(destTy)); return cgm.getCXXABI().emitDynamicCast(*this, loc, srcRecordTy, destRecordTy, diff --git a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp index d8f4943a7755a..047f3599eed03 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprComplex.cpp @@ -390,7 +390,7 @@ ComplexExprEmitter::VisitImaginaryLiteral(const ImaginaryLiteral *il) { } auto complexAttr = cir::ConstComplexAttr::get(realValueAttr, imagValueAttr); - return builder.create(loc, complexAttr); + return cir::ConstantOp::create(builder, loc, complexAttr); } mlir::Value ComplexExprEmitter::VisitCallExpr(const CallExpr *e) { @@ -601,7 +601,7 @@ mlir::Value ComplexExprEmitter::emitBinAdd(const BinOpInfo &op) { if (mlir::isa(op.lhs.getType()) && mlir::isa(op.rhs.getType())) - return builder.create(op.loc, op.lhs, op.rhs); + return cir::ComplexAddOp::create(builder, op.loc, op.lhs, op.rhs); if (mlir::isa(op.lhs.getType())) { mlir::Value real = builder.createComplexReal(op.loc, op.lhs); @@ -623,7 +623,7 @@ mlir::Value ComplexExprEmitter::emitBinSub(const BinOpInfo &op) { if (mlir::isa(op.lhs.getType()) && mlir::isa(op.rhs.getType())) - return builder.create(op.loc, op.lhs, op.rhs); + return 
cir::ComplexSubOp::create(builder, op.loc, op.lhs, op.rhs); if (mlir::isa(op.lhs.getType())) { mlir::Value real = builder.createComplexReal(op.loc, op.lhs); @@ -664,7 +664,8 @@ mlir::Value ComplexExprEmitter::emitBinMul(const BinOpInfo &op) { mlir::isa(op.rhs.getType())) { cir::ComplexRangeKind rangeKind = getComplexRangeAttr(op.fpFeatures.getComplexRange()); - return builder.create(op.loc, op.lhs, op.rhs, rangeKind); + return cir::ComplexMulOp::create(builder, op.loc, op.lhs, op.rhs, + rangeKind); } if (mlir::isa(op.lhs.getType())) { @@ -968,23 +969,22 @@ mlir::Value ComplexExprEmitter::VisitAbstractConditionalOperator( Expr *cond = e->getCond()->IgnoreParens(); mlir::Value condValue = cgf.evaluateExprAsBool(cond); - return builder - .create( - loc, condValue, - /*thenBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - eval.beginEvaluation(); - mlir::Value trueValue = Visit(e->getTrueExpr()); - b.create(loc, trueValue); - eval.endEvaluation(); - }, - /*elseBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - eval.beginEvaluation(); - mlir::Value falseValue = Visit(e->getFalseExpr()); - b.create(loc, falseValue); - eval.endEvaluation(); - }) + return cir::TernaryOp::create( + builder, loc, condValue, + /*thenBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + eval.beginEvaluation(); + mlir::Value trueValue = Visit(e->getTrueExpr()); + cir::YieldOp::create(b, loc, trueValue); + eval.endEvaluation(); + }, + /*elseBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + eval.beginEvaluation(); + mlir::Value falseValue = Visit(e->getFalseExpr()); + cir::YieldOp::create(b, loc, falseValue); + eval.endEvaluation(); + }) .getResult(); } diff --git a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp index 19ed6560245b4..928e5aa821bb5 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprConstant.cpp @@ -179,8 +179,23 @@ bool 
ConstantAggregateBuilder::add(mlir::TypedAttr typedAttr, CharUnits offset, } // Uncommon case: constant overlaps what we've already created. - cgm.errorNYI("overlapping constants"); - return false; + std::optional firstElemToReplace = splitAt(offset); + if (!firstElemToReplace) + return false; + + CharUnits cSize = getSize(typedAttr); + std::optional lastElemToReplace = splitAt(offset + cSize); + if (!lastElemToReplace) + return false; + + assert((firstElemToReplace == lastElemToReplace || allowOverwrite) && + "unexpectedly overwriting field"); + + Element newElt(typedAttr, offset); + replace(elements, *firstElemToReplace, *lastElemToReplace, {newElt}); + size = std::max(size, offset + cSize); + naturalLayout = false; + return true; } bool ConstantAggregateBuilder::addBits(llvm::APInt bits, uint64_t offsetInBits, @@ -612,10 +627,7 @@ bool ConstRecordBuilder::applyZeroInitPadding(const ASTRecordLayout &layout, } bool ConstRecordBuilder::build(InitListExpr *ile, bool allowOverwrite) { - RecordDecl *rd = ile->getType() - ->castAs() - ->getDecl() - ->getDefinitionOrSelf(); + RecordDecl *rd = ile->getType()->castAsRecordDecl(); const ASTRecordLayout &layout = cgm.getASTContext().getASTRecordLayout(rd); // Bail out if we have base classes. We could support these, but they only @@ -671,17 +683,14 @@ bool ConstRecordBuilder::build(InitListExpr *ile, bool allowOverwrite) { return false; } - mlir::TypedAttr eltInit; - if (init) - eltInit = mlir::cast( - emitter.tryEmitPrivateForMemory(init, field->getType())); - else - eltInit = mlir::cast(emitter.emitNullForMemory( - cgm.getLoc(ile->getSourceRange()), field->getType())); - - if (!eltInit) + mlir::Attribute eltInitAttr = + init ? emitter.tryEmitPrivateForMemory(init, field->getType()) + : emitter.emitNullForMemory(cgm.getLoc(ile->getSourceRange()), + field->getType()); + if (!eltInitAttr) return false; + mlir::TypedAttr eltInit = mlir::cast(eltInitAttr); if (!field->isBitField()) { // Handle non-bitfield members. 
if (!appendField(field, layout.getFieldOffset(index), eltInit, @@ -913,9 +922,9 @@ class ConstExprEmitter } mlir::Attribute VisitCastExpr(CastExpr *e, QualType destType) { - if (isa(e)) - cgm.errorNYI(e->getBeginLoc(), - "ConstExprEmitter::VisitCastExpr explicit cast"); + if (const auto *ece = dyn_cast(e)) + cgm.emitExplicitCastExprType(ece, + const_cast(emitter.cgf)); Expr *subExpr = e->getSubExpr(); @@ -1011,9 +1020,9 @@ class ConstExprEmitter } mlir::Attribute VisitCXXDefaultInitExpr(CXXDefaultInitExpr *die, QualType t) { - cgm.errorNYI(die->getBeginLoc(), - "ConstExprEmitter::VisitCXXDefaultInitExpr"); - return {}; + // No need for a DefaultInitExprScope: we don't handle 'this' in a + // constant expression. + return Visit(die->getExpr(), t); } mlir::Attribute VisitExprWithCleanups(ExprWithCleanups *e, QualType t) { @@ -1028,9 +1037,7 @@ class ConstExprEmitter mlir::Attribute VisitImplicitValueInitExpr(ImplicitValueInitExpr *e, QualType t) { - cgm.errorNYI(e->getBeginLoc(), - "ConstExprEmitter::VisitImplicitValueInitExpr"); - return {}; + return cgm.getBuilder().getZeroInitAttr(cgm.convertType(t)); } mlir::Attribute VisitInitListExpr(InitListExpr *ile, QualType t) { @@ -1071,9 +1078,32 @@ class ConstExprEmitter mlir::Attribute VisitCXXConstructExpr(CXXConstructExpr *e, QualType ty) { if (!e->getConstructor()->isTrivial()) - return nullptr; - cgm.errorNYI(e->getBeginLoc(), "trivial constructor const handling"); - return {}; + return {}; + + // Only default and copy/move constructors can be trivial. 
+ if (e->getNumArgs()) { + assert(e->getNumArgs() == 1 && "trivial ctor with > 1 argument"); + assert(e->getConstructor()->isCopyOrMoveConstructor() && + "trivial ctor has argument but isn't a copy/move ctor"); + + Expr *arg = e->getArg(0); + assert(cgm.getASTContext().hasSameUnqualifiedType(ty, arg->getType()) && + "argument to copy ctor is of wrong type"); + + // Look through the temporary; it's just converting the value to an lvalue + // to pass it to the constructor. + if (auto const *mte = dyn_cast(arg)) + return Visit(mte->getSubExpr(), ty); + + // TODO: Investigate whether there are cases that can fall through to here + // that need to be handled. This is missing in classic codegen also. + assert(!cir::MissingFeatures::ctorConstLvalueToRvalueConversion()); + + // Don't try to support arbitrary lvalue-to-rvalue conversions for now. + return {}; + } + + return cgm.getBuilder().getZeroInitAttr(cgm.convertType(ty)); } mlir::Attribute VisitStringLiteral(StringLiteral *e, QualType t) { diff --git a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp index 138082b59fecd..db6878d479366 100644 --- a/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenExprScalar.cpp @@ -164,22 +164,22 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value VisitIntegerLiteral(const IntegerLiteral *e) { mlir::Type type = cgf.convertType(e->getType()); - return builder.create( - cgf.getLoc(e->getExprLoc()), cir::IntAttr::get(type, e->getValue())); + return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()), + cir::IntAttr::get(type, e->getValue())); } mlir::Value VisitFloatingLiteral(const FloatingLiteral *e) { mlir::Type type = cgf.convertType(e->getType()); assert(mlir::isa(type) && "expect floating-point type"); - return builder.create( - cgf.getLoc(e->getExprLoc()), cir::FPAttr::get(type, e->getValue())); + return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()), + cir::FPAttr::get(type, 
e->getValue())); } mlir::Value VisitCharacterLiteral(const CharacterLiteral *e) { mlir::Type ty = cgf.convertType(e->getType()); auto init = cir::IntAttr::get(ty, e->getValue()); - return builder.create(cgf.getLoc(e->getExprLoc()), init); + return cir::ConstantOp::create(builder, cgf.getLoc(e->getExprLoc()), init); } mlir::Value VisitCXXBoolLiteralExpr(const CXXBoolLiteralExpr *e) { @@ -227,7 +227,7 @@ class ScalarExprEmitter : public StmtVisitor { const mlir::Location loc = cgf.getLoc(e->getSourceRange()); const mlir::Value vecValue = Visit(e->getBase()); const mlir::Value indexValue = Visit(e->getIdx()); - return cgf.builder.create(loc, vecValue, indexValue); + return cir::VecExtractOp::create(cgf.builder, loc, vecValue, indexValue); } // Just load the lvalue formed by the subscript expression. return emitLoadOfLValue(e); @@ -238,8 +238,8 @@ class ScalarExprEmitter : public StmtVisitor { // The undocumented form of __builtin_shufflevector. mlir::Value inputVec = Visit(e->getExpr(0)); mlir::Value indexVec = Visit(e->getExpr(1)); - return cgf.builder.create( - cgf.getLoc(e->getSourceRange()), inputVec, indexVec); + return cir::VecShuffleDynamicOp::create( + cgf.builder, cgf.getLoc(e->getSourceRange()), inputVec, indexVec); } mlir::Value vec1 = Visit(e->getExpr(0)); @@ -257,9 +257,10 @@ class ScalarExprEmitter : public StmtVisitor { .getSExtValue())); } - return cgf.builder.create( - cgf.getLoc(e->getSourceRange()), cgf.convertType(e->getType()), vec1, - vec2, cgf.builder.getArrayAttr(indices)); + return cir::VecShuffleOp::create(cgf.builder, + cgf.getLoc(e->getSourceRange()), + cgf.convertType(e->getType()), vec1, vec2, + cgf.builder.getArrayAttr(indices)); } mlir::Value VisitConvertVectorExpr(ConvertVectorExpr *e) { @@ -296,8 +297,8 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value emitFloatToBoolConversion(mlir::Value src, mlir::Location loc) { cir::BoolType boolTy = builder.getBoolTy(); - return builder.create(loc, boolTy, - 
cir::CastKind::float_to_bool, src); + return cir::CastOp::create(builder, loc, boolTy, + cir::CastKind::float_to_bool, src); } mlir::Value emitIntToBoolConversion(mlir::Value srcVal, mlir::Location loc) { @@ -307,8 +308,8 @@ class ScalarExprEmitter : public StmtVisitor { // TODO: optimize this common case here or leave it for later // CIR passes? cir::BoolType boolTy = builder.getBoolTy(); - return builder.create(loc, boolTy, cir::CastKind::int_to_bool, - srcVal); + return cir::CastOp::create(builder, loc, boolTy, cir::CastKind::int_to_bool, + srcVal); } /// Convert the specified expression value to a boolean (!cir.bool) truth @@ -411,7 +412,8 @@ class ScalarExprEmitter : public StmtVisitor { } assert(castKind.has_value() && "Internal error: CastKind not set."); - return builder.create(src.getLoc(), fullDstTy, *castKind, src); + return cir::CastOp::create(builder, src.getLoc(), fullDstTy, *castKind, + src); } mlir::Value @@ -658,9 +660,9 @@ class ScalarExprEmitter : public StmtVisitor { mlir::Value emitUnaryOp(const UnaryOperator *e, cir::UnaryOpKind kind, mlir::Value input, bool nsw = false) { - return builder.create( - cgf.getLoc(e->getSourceRange().getBegin()), input.getType(), kind, - input, nsw); + return cir::UnaryOp::create(builder, + cgf.getLoc(e->getSourceRange().getBegin()), + input.getType(), kind, input, nsw); } mlir::Value VisitUnaryNot(const UnaryOperator *e) { @@ -967,9 +969,9 @@ class ScalarExprEmitter : public StmtVisitor { } else { // Other kinds of vectors. Element-wise comparison returning // a vector. 
- result = builder.create( - cgf.getLoc(boInfo.loc), cgf.convertType(boInfo.fullType), kind, - boInfo.lhs, boInfo.rhs); + result = cir::VecCmpOp::create(builder, cgf.getLoc(boInfo.loc), + cgf.convertType(boInfo.fullType), kind, + boInfo.lhs, boInfo.rhs); } } else if (boInfo.isFixedPointOp()) { assert(!cir::MissingFeatures::fixedPointType()); @@ -991,7 +993,7 @@ class ScalarExprEmitter : public StmtVisitor { assert(e->getOpcode() == BO_EQ || e->getOpcode() == BO_NE); BinOpInfo boInfo = emitBinOps(e); - result = builder.create(loc, kind, boInfo.lhs, boInfo.rhs); + result = cir::CmpOp::create(builder, loc, kind, boInfo.lhs, boInfo.rhs); } return emitScalarConversion(result, cgf.getContext().BoolTy, e->getType(), @@ -1093,8 +1095,8 @@ class ScalarExprEmitter : public StmtVisitor { CIRGenFunction::ConditionalEvaluation eval(cgf); mlir::Value lhsCondV = cgf.evaluateExprAsBool(e->getLHS()); - auto resOp = builder.create( - loc, lhsCondV, /*trueBuilder=*/ + auto resOp = cir::TernaryOp::create( + builder, loc, lhsCondV, /*trueBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { CIRGenFunction::LexicalScope lexScope{cgf, loc, b.getInsertionBlock()}; @@ -1139,8 +1141,8 @@ class ScalarExprEmitter : public StmtVisitor { CIRGenFunction::ConditionalEvaluation eval(cgf); mlir::Value lhsCondV = cgf.evaluateExprAsBool(e->getLHS()); - auto resOp = builder.create( - loc, lhsCondV, /*trueBuilder=*/ + auto resOp = cir::TernaryOp::create( + builder, loc, lhsCondV, /*trueBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { CIRGenFunction::LexicalScope lexScope{cgf, loc, b.getInsertionBlock()}; @@ -1566,8 +1568,9 @@ static mlir::Value emitPointerArithmetic(CIRGenFunction &cgf, } assert(!cir::MissingFeatures::sanitizers()); - return cgf.getBuilder().create( - cgf.getLoc(op.e->getExprLoc()), pointer.getType(), pointer, index); + return cir::PtrStrideOp::create(cgf.getBuilder(), + cgf.getLoc(op.e->getExprLoc()), + pointer.getType(), pointer, index); } mlir::Value 
ScalarExprEmitter::emitMul(const BinOpInfo &ops) { @@ -1609,19 +1612,19 @@ mlir::Value ScalarExprEmitter::emitMul(const BinOpInfo &ops) { return nullptr; } - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Mul, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Mul, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitDiv(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Div, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Div, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitRem(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Rem, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Rem, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitAdd(const BinOpInfo &ops) { @@ -1668,8 +1671,8 @@ mlir::Value ScalarExprEmitter::emitAdd(const BinOpInfo &ops) { return {}; } - return builder.create(loc, cgf.convertType(ops.fullType), - cir::BinOpKind::Add, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, loc, cgf.convertType(ops.fullType), + cir::BinOpKind::Add, ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) { @@ -1716,9 +1719,9 @@ mlir::Value ScalarExprEmitter::emitSub(const BinOpInfo &ops) { return {}; } - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Sub, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), + cir::BinOpKind::Sub, ops.lhs, ops.rhs); } // If the RHS is not a pointer, then we have normal pointer @@ -1796,19 +1799,19 @@ mlir::Value ScalarExprEmitter::emitShr(const BinOpInfo &ops) { } 
mlir::Value ScalarExprEmitter::emitAnd(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::And, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::And, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitXor(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Xor, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Xor, + ops.lhs, ops.rhs); } mlir::Value ScalarExprEmitter::emitOr(const BinOpInfo &ops) { - return builder.create(cgf.getLoc(ops.loc), - cgf.convertType(ops.fullType), - cir::BinOpKind::Or, ops.lhs, ops.rhs); + return cir::BinOp::create(builder, cgf.getLoc(ops.loc), + cgf.convertType(ops.fullType), cir::BinOpKind::Or, + ops.lhs, ops.rhs); } // Emit code for an explicit or implicit cast. Implicit @@ -2011,9 +2014,9 @@ mlir::Value ScalarExprEmitter::VisitCastExpr(CastExpr *ce) { case CK_VectorSplat: { // Create a vector object and fill all elements with the same scalar value. assert(destTy->isVectorType() && "CK_VectorSplat to non-vector type"); - return builder.create( - cgf.getLoc(subExpr->getSourceRange()), cgf.convertType(destTy), - Visit(subExpr)); + return cir::VecSplatOp::create(builder, + cgf.getLoc(subExpr->getSourceRange()), + cgf.convertType(destTy), Visit(subExpr)); } case CK_FunctionToPointerDecay: return cgf.emitLValue(subExpr).getPointer(); @@ -2041,8 +2044,9 @@ mlir::Value ScalarExprEmitter::VisitMemberExpr(MemberExpr *e) { assert(!cir::MissingFeatures::tryEmitAsConstant()); Expr::EvalResult result; if (e->EvaluateAsInt(result, cgf.getContext(), Expr::SE_AllowSideEffects)) { - cgf.cgm.errorNYI(e->getSourceRange(), "Constant interger member expr"); - // Fall through to emit this as a non-constant access. 
+ llvm::APSInt value = result.Val.getInt(); + cgf.emitIgnoredExpr(e->getBase()); + return builder.getConstInt(cgf.getLoc(e->getExprLoc()), value); } return emitLoadOfLValue(e); } @@ -2072,8 +2076,9 @@ mlir::Value ScalarExprEmitter::VisitInitListExpr(InitListExpr *e) { vectorType.getSize() - numInitElements, zeroValue); } - return cgf.getBuilder().create( - cgf.getLoc(e->getSourceRange()), vectorType, elements); + return cir::VecCreateOp::create(cgf.getBuilder(), + cgf.getLoc(e->getSourceRange()), vectorType, + elements); } // C++11 value-initialization for the scalar. @@ -2309,8 +2314,8 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( mlir::Value condValue = Visit(condExpr); mlir::Value lhsValue = Visit(lhsExpr); mlir::Value rhsValue = Visit(rhsExpr); - return builder.create(loc, condValue, lhsValue, - rhsValue); + return cir::VecTernaryOp::create(builder, loc, condValue, lhsValue, + rhsValue); } // If this is a really simple expression (like x ? 4 : 5), emit this as a @@ -2353,7 +2358,7 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( if (branch) { yieldTy = branch.getType(); - b.create(loc, branch); + cir::YieldOp::create(b, loc, branch); } else { // If LHS or RHS is a throw or void expression we need to patch // arms as to properly match yield types. 
@@ -2361,17 +2366,16 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( } }; - mlir::Value result = builder - .create( - loc, condV, - /*trueBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - emitBranch(b, loc, lhsExpr); - }, - /*falseBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - emitBranch(b, loc, rhsExpr); - }) + mlir::Value result = cir::TernaryOp::create( + builder, loc, condV, + /*trueBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + emitBranch(b, loc, lhsExpr); + }, + /*falseBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + emitBranch(b, loc, rhsExpr); + }) .getResult(); if (!insertPoints.empty()) { @@ -2386,10 +2390,10 @@ mlir::Value ScalarExprEmitter::VisitAbstractConditionalOperator( // Block does not return: build empty yield. if (mlir::isa(yieldTy)) { - builder.create(loc); + cir::YieldOp::create(builder, loc); } else { // Block returns: set null yield value. mlir::Value op0 = builder.getNullValue(yieldTy, loc); - builder.create(loc, op0); + cir::YieldOp::create(builder, loc, op0); } } } diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp index d3c0d9f109317..58feb36f78f23 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.cpp @@ -264,11 +264,11 @@ void CIRGenFunction::LexicalScope::cleanup() { // If we now have one after `applyCleanup`, hook it up properly. 
if (!cleanupBlock && localScope->getCleanupBlock(builder)) { cleanupBlock = localScope->getCleanupBlock(builder); - builder.create(insPt->back().getLoc(), cleanupBlock); + cir::BrOp::create(builder, insPt->back().getLoc(), cleanupBlock); if (!cleanupBlock->mightHaveTerminator()) { mlir::OpBuilder::InsertionGuard guard(builder); builder.setInsertionPointToEnd(cleanupBlock); - builder.create(localScope->endLoc); + cir::YieldOp::create(builder, localScope->endLoc); } } @@ -286,7 +286,7 @@ void CIRGenFunction::LexicalScope::cleanup() { } } - builder.create(*returnLoc, returnBlock); + cir::BrOp::create(builder, *returnLoc, returnBlock); return; } } @@ -298,8 +298,8 @@ void CIRGenFunction::LexicalScope::cleanup() { // Ternary ops have to deal with matching arms for yielding types // and do return a value, it must do its own cir.yield insertion. if (!localScope->isTernary() && !insPt->mightHaveTerminator()) { - !retVal ? builder.create(localScope->endLoc) - : builder.create(localScope->endLoc, retVal); + !retVal ? cir::YieldOp::create(builder, localScope->endLoc) + : cir::YieldOp::create(builder, localScope->endLoc, retVal); } }; @@ -331,7 +331,7 @@ void CIRGenFunction::LexicalScope::cleanup() { // If there's a cleanup block, branch to it, nothing else to do. if (cleanupBlock) { - builder.create(curBlock->back().getLoc(), cleanupBlock); + cir::BrOp::create(builder, curBlock->back().getLoc(), cleanupBlock); return; } @@ -349,12 +349,12 @@ cir::ReturnOp CIRGenFunction::LexicalScope::emitReturn(mlir::Location loc) { assert(fn && "emitReturn from non-function"); if (!fn.getFunctionType().hasVoidReturn()) { // Load the value from `__retval` and return it via the `cir.return` op. 
- auto value = builder.create( - loc, fn.getFunctionType().getReturnType(), *cgf.fnRetAlloca); - return builder.create(loc, - llvm::ArrayRef(value.getResult())); + auto value = cir::LoadOp::create( + builder, loc, fn.getFunctionType().getReturnType(), *cgf.fnRetAlloca); + return cir::ReturnOp::create(builder, loc, + llvm::ArrayRef(value.getResult())); } - return builder.create(loc); + return cir::ReturnOp::create(builder, loc); } // This is copied from CodeGenModule::MayDropFunctionReturn. This is a @@ -389,9 +389,9 @@ void CIRGenFunction::LexicalScope::emitImplicitReturn() { if (shouldEmitUnreachable) { assert(!cir::MissingFeatures::sanitizers()); if (cgf.cgm.getCodeGenOpts().OptimizationLevel == 0) - builder.create(localScope->endLoc); + cir::TrapOp::create(builder, localScope->endLoc); else - builder.create(localScope->endLoc); + cir::UnreachableOp::create(builder, localScope->endLoc); builder.clearInsertionPoint(); return; } @@ -561,8 +561,8 @@ cir::FuncOp CIRGenFunction::generateCode(clang::GlobalDecl gd, cir::FuncOp fn, if (!clone) { mlir::OpBuilder::InsertionGuard guard(builder); builder.setInsertionPoint(fn); - clone = builder.create(fn.getLoc(), fdInlineName, - fn.getFunctionType()); + clone = cir::FuncOp::create(builder, fn.getLoc(), fdInlineName, + fn.getFunctionType()); clone.setLinkage(cir::GlobalLinkageKind::InternalLinkage); clone.setSymVisibility("private"); clone.setInlineKind(cir::InlineKind::AlwaysInline); diff --git a/clang/lib/CIR/CodeGen/CIRGenFunction.h b/clang/lib/CIR/CodeGen/CIRGenFunction.h index fa3260c5ec5b8..c3fcd1a69a88e 100644 --- a/clang/lib/CIR/CodeGen/CIRGenFunction.h +++ b/clang/lib/CIR/CodeGen/CIRGenFunction.h @@ -60,11 +60,44 @@ class CIRGenFunction : public CIRGenTypeCache { /// is where the next operations will be introduced. CIRGenBuilderTy &builder; + /// A jump destination is an abstract label, branching to which may + /// require a jump out through normal cleanups. 
+ struct JumpDest {
+ JumpDest() = default;
+ JumpDest(mlir::Block *block, EHScopeStack::stable_iterator depth = {},
+ unsigned index = 0)
+ : block(block), scopeDepth(depth), index(index) {}
+
+ bool isValid() const { return block != nullptr; }
+ mlir::Block *getBlock() const { return block; }
+ EHScopeStack::stable_iterator getScopeDepth() const { return scopeDepth; }
+ unsigned getDestIndex() const { return index; }
+
+ // This should be used cautiously.
+ void setScopeDepth(EHScopeStack::stable_iterator depth) {
+ scopeDepth = depth;
+ }
+
+ private:
+ mlir::Block *block = nullptr;
+ EHScopeStack::stable_iterator scopeDepth;
+ unsigned index = 0;
+ };
+
 public:
 /// The GlobalDecl for the current function being compiled or the global
 /// variable currently being initialized.
 clang::GlobalDecl curGD;
 
+ /// Unified return block.
+ /// In CIR this is a function because each scope might have
+ /// its associated return block.
+ JumpDest returnBlock(mlir::Block *retBlock) {
+ return getJumpDestInCurrentScope(retBlock);
+ }
+
+ unsigned nextCleanupDestIndex = 1;
+
 /// The compiler-generated variable that holds the return value.
 std::optional fnRetAlloca;
@@ -87,6 +120,8 @@ class CIRGenFunction : public CIRGenTypeCache {
 /// Tracks function scope overall cleanup handling.
 EHScopeStack ehStack;
 
+ GlobalDecl curSEHParent;
+
 llvm::DenseMap lambdaCaptureFields;
 clang::FieldDecl *lambdaThisCaptureField = nullptr;
@@ -574,6 +609,16 @@ class CIRGenFunction : public CIRGenTypeCache {
 }
 };
 
+ /// The given basic block lies in the current EH scope, but may be a
+ /// target of a potentially scope-crossing jump; get a stable handle
+ /// to which we can perform this jump later.
+ /// CIRGen: this mostly tracks state for figuring out the proper scope
+ /// information, no actual branches are emitted. 
+ JumpDest getJumpDestInCurrentScope(mlir::Block *target) { + return JumpDest(target, ehStack.getInnermostNormalCleanup(), + nextCleanupDestIndex++); + } + /// Perform the usual unary conversions on the specified expression and /// compare the result against zero, returning an Int1Ty value. mlir::Value evaluateExprAsBool(const clang::Expr *e); @@ -622,6 +667,12 @@ class CIRGenFunction : public CIRGenTypeCache { symbolTable.insert(vd, addr.getPointer()); } + // Replaces the address of the local variable, if it exists. Else does the + // same thing as setAddrOfLocalVar. + void replaceAddrOfLocalVar(const clang::VarDecl *vd, Address addr) { + localDeclMap.insert_or_assign(vd, addr); + } + // A class to allow reverting changes to a var-decl's registration to the // localDeclMap. This is used in cases where things are being inserted into // the variable list but don't follow normal lookup/search rules, like in @@ -1221,6 +1272,8 @@ class CIRGenFunction : public CIRGenTypeCache { LValue emitBinaryOperatorLValue(const BinaryOperator *e); + cir::BrOp emitBranchThroughCleanup(mlir::Location loc, JumpDest dest); + mlir::LogicalResult emitBreakStmt(const clang::BreakStmt &s); RValue emitBuiltinExpr(const clang::GlobalDecl &gd, unsigned builtinID, @@ -1281,6 +1334,9 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitCoroutineBody(const CoroutineBodyStmt &s); cir::CallOp emitCoroEndBuiltinCall(mlir::Location loc, mlir::Value nullPtr); cir::CallOp emitCoroIDBuiltinCall(mlir::Location loc, mlir::Value nullPtr); + cir::CallOp emitCoroAllocBuiltinCall(mlir::Location loc); + cir::CallOp emitCoroBeginBuiltinCall(mlir::Location loc, + mlir::Value coroframeAddr); void emitDestroy(Address addr, QualType type, Destroyer *destroyer); @@ -1416,6 +1472,9 @@ class CIRGenFunction : public CIRGenTypeCache { void emitReturnOfRValue(mlir::Location loc, RValue rv, QualType ty); + mlir::Value emitRuntimeCall(mlir::Location loc, cir::FuncOp callee, + llvm::ArrayRef args = 
{}); + /// Emit the computation of the specified expression of scalar type. mlir::Value emitScalarExpr(const clang::Expr *e); @@ -1614,6 +1673,10 @@ class CIRGenFunction : public CIRGenTypeCache { bool buildingTopLevelCase); mlir::LogicalResult emitSwitchStmt(const clang::SwitchStmt &s); + mlir::Value emitTargetBuiltinExpr(unsigned builtinID, + const clang::CallExpr *e, + ReturnValueSlot &returnValue); + /// Given a value and its clang type, returns the value casted to its memory /// representation. /// Note: CIR defers most of the special casting to the final lowering passes @@ -1652,6 +1715,8 @@ class CIRGenFunction : public CIRGenTypeCache { mlir::LogicalResult emitWhileStmt(const clang::WhileStmt &s); + mlir::Value emitX86BuiltinExpr(unsigned builtinID, const CallExpr *e); + /// Given an assignment `*lhs = rhs`, emit a test that checks if \p rhs is /// nonnull, if 1\p LHS is marked _Nonnull. void emitNullabilityCheck(LValue lhs, mlir::Value rhs, diff --git a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp index c184d4a4b1d97..88fedf1acc6a1 100644 --- a/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenItaniumCXXABI.cpp @@ -74,6 +74,9 @@ class CIRGenItaniumCXXABI : public CIRGenCXXABI { QualType thisTy) override; void registerGlobalDtor(const VarDecl *vd, cir::FuncOp dtor, mlir::Value addr) override; + void emitVirtualObjectDelete(CIRGenFunction &cgf, const CXXDeleteExpr *de, + Address ptr, QualType elementType, + const CXXDestructorDecl *dtor) override; void emitRethrow(CIRGenFunction &cgf, bool isNoReturn) override; void emitThrow(CIRGenFunction &cgf, const CXXThrowExpr *e) override; @@ -120,6 +123,8 @@ class CIRGenItaniumCXXABI : public CIRGenCXXABI { return true; } + void emitBadCastCall(CIRGenFunction &cgf, mlir::Location loc) override; + mlir::Value getVirtualBaseClassOffset(mlir::Location loc, CIRGenFunction &cgf, Address thisAddr, const CXXRecordDecl *classDecl, @@ -135,8 +140,14 
@@ class CIRGenItaniumCXXABI : public CIRGenCXXABI { cir::PointerType destCIRTy, bool isRefCast, Address src) override; - /**************************** RTTI Uniqueness ******************************/ + Address initializeArrayCookie(CIRGenFunction &cgf, Address newPtr, + mlir::Value numElements, const CXXNewExpr *e, + QualType elementType) override; + protected: + CharUnits getArrayCookieSizeImpl(QualType elementType) override; + + /**************************** RTTI Uniqueness ******************************/ /// Returns true if the ABI requires RTTI type_info objects to be unique /// across a program. virtual bool shouldRTTIBeUnique() const { return true; } @@ -1803,8 +1814,8 @@ CIRGenItaniumCXXABI::getVTableAddressPoint(BaseSubobject base, mlir::OpBuilder &builder = cgm.getBuilder(); auto vtablePtrTy = cir::VPtrType::get(builder.getContext()); - return builder.create( - cgm.getLoc(vtableClass->getSourceRange()), vtablePtrTy, + return cir::VTableAddrPointOp::create( + builder, cgm.getLoc(vtableClass->getSourceRange()), vtablePtrTy, mlir::FlatSymbolRefAttr::get(vtable.getSymNameAttr()), cir::AddressPointAttr::get(cgm.getBuilder().getContext(), addressPoint.VTableIndex, @@ -1868,6 +1879,20 @@ static cir::FuncOp getBadCastFn(CIRGenFunction &cgf) { return cgf.cgm.createRuntimeFunction(fnTy, "__cxa_bad_cast"); } +static void emitCallToBadCast(CIRGenFunction &cgf, mlir::Location loc) { + // TODO(cir): set the calling convention to the runtime function. + assert(!cir::MissingFeatures::opFuncCallingConv()); + + cgf.emitRuntimeCall(loc, getBadCastFn(cgf)); + cir::UnreachableOp::create(cgf.getBuilder(), loc); + cgf.getBuilder().clearInsertionPoint(); +} + +void CIRGenItaniumCXXABI::emitBadCastCall(CIRGenFunction &cgf, + mlir::Location loc) { + emitCallToBadCast(cgf, loc); +} + // TODO(cir): This could be shared with classic codegen. 
static CharUnits computeOffsetHint(ASTContext &astContext, const CXXRecordDecl *src, @@ -1953,6 +1978,136 @@ static Address emitDynamicCastToVoid(CIRGenFunction &cgf, mlir::Location loc, return Address{ptr, src.getAlignment()}; } +static mlir::Value emitExactDynamicCast(CIRGenItaniumCXXABI &abi, + CIRGenFunction &cgf, mlir::Location loc, + QualType srcRecordTy, + QualType destRecordTy, + cir::PointerType destCIRTy, + bool isRefCast, Address src) { + // Find all the inheritance paths from SrcRecordTy to DestRecordTy. + const CXXRecordDecl *srcDecl = srcRecordTy->getAsCXXRecordDecl(); + const CXXRecordDecl *destDecl = destRecordTy->getAsCXXRecordDecl(); + CXXBasePaths paths(/*FindAmbiguities=*/true, /*RecordPaths=*/true, + /*DetectVirtual=*/false); + (void)destDecl->isDerivedFrom(srcDecl, paths); + + // Find an offset within `destDecl` where a `srcDecl` instance and its vptr + // might appear. + std::optional offset; + for (const CXXBasePath &path : paths) { + // dynamic_cast only finds public inheritance paths. + if (path.Access != AS_public) + continue; + + CharUnits pathOffset; + for (const CXXBasePathElement &pathElement : path) { + // Find the offset along this inheritance step. + const CXXRecordDecl *base = + pathElement.Base->getType()->getAsCXXRecordDecl(); + if (pathElement.Base->isVirtual()) { + // For a virtual base class, we know that the derived class is exactly + // destDecl, so we can use the vbase offset from its layout. + const ASTRecordLayout &layout = + cgf.getContext().getASTRecordLayout(destDecl); + pathOffset = layout.getVBaseClassOffset(base); + } else { + const ASTRecordLayout &layout = + cgf.getContext().getASTRecordLayout(pathElement.Class); + pathOffset += layout.getBaseClassOffset(base); + } + } + + if (!offset) { + offset = pathOffset; + } else if (offset != pathOffset) { + // base appears in at least two different places. Find the most-derived + // object and see if it's a DestDecl. 
Note that the most-derived object + // must be at least as aligned as this base class subobject, and must + // have a vptr at offset 0. + src = emitDynamicCastToVoid(cgf, loc, srcRecordTy, src); + srcDecl = destDecl; + offset = CharUnits::Zero(); + break; + } + } + + CIRGenBuilderTy &builder = cgf.getBuilder(); + + if (!offset) { + // If there are no public inheritance paths, the cast always fails. + mlir::Value nullPtrValue = builder.getNullPtr(destCIRTy, loc); + if (isRefCast) { + mlir::Region *currentRegion = builder.getBlock()->getParent(); + emitCallToBadCast(cgf, loc); + + // The call to bad_cast will terminate the block. Create a new block to + // hold any follow up code. + builder.createBlock(currentRegion, currentRegion->end()); + } + + return nullPtrValue; + } + + // Compare the vptr against the expected vptr for the destination type at + // this offset. Note that we do not know what type src points to in the case + // where the derived class multiply inherits from the base class so we can't + // use getVTablePtr, so we load the vptr directly instead. + + mlir::Value expectedVPtr = + abi.getVTableAddressPoint(BaseSubobject(srcDecl, *offset), destDecl); + + // TODO(cir): handle address space here. + assert(!cir::MissingFeatures::addressSpace()); + mlir::Type vptrTy = expectedVPtr.getType(); + mlir::Type vptrPtrTy = builder.getPointerTo(vptrTy); + Address srcVPtrPtr(builder.createBitcast(src.getPointer(), vptrPtrTy), + src.getAlignment()); + mlir::Value srcVPtr = builder.createLoad(loc, srcVPtrPtr); + + // TODO(cir): decorate SrcVPtr with TBAA info. + assert(!cir::MissingFeatures::opTBAA()); + + mlir::Value success = + builder.createCompare(loc, cir::CmpOpKind::eq, srcVPtr, expectedVPtr); + + auto emitCastResult = [&] { + if (offset->isZero()) + return builder.createBitcast(src.getPointer(), destCIRTy); + + // TODO(cir): handle address space here. 
+ assert(!cir::MissingFeatures::addressSpace()); + mlir::Type u8PtrTy = builder.getUInt8PtrTy(); + + mlir::Value strideToApply = + builder.getConstInt(loc, builder.getUInt64Ty(), -offset->getQuantity()); + mlir::Value srcU8Ptr = builder.createBitcast(src.getPointer(), u8PtrTy); + mlir::Value resultU8Ptr = cir::PtrStrideOp::create(builder, loc, u8PtrTy, + srcU8Ptr, strideToApply); + return builder.createBitcast(resultU8Ptr, destCIRTy); + }; + + if (isRefCast) { + mlir::Value failed = builder.createNot(success); + cir::IfOp::create(builder, loc, failed, /*withElseRegion=*/false, + [&](mlir::OpBuilder &, mlir::Location) { + emitCallToBadCast(cgf, loc); + }); + return emitCastResult(); + } + + return cir::TernaryOp::create( + builder, loc, success, + [&](mlir::OpBuilder &, mlir::Location) { + auto result = emitCastResult(); + builder.createYield(loc, result); + }, + [&](mlir::OpBuilder &, mlir::Location) { + mlir::Value nullPtrValue = builder.getNullPtr(destCIRTy, loc); + builder.createYield(loc, nullPtrValue); + }) + .getResult(); +} + static cir::DynamicCastInfoAttr emitDynamicCastInfo(CIRGenFunction &cgf, mlir::Location loc, QualType srcRecordTy, @@ -1994,8 +2149,27 @@ mlir::Value CIRGenItaniumCXXABI::emitDynamicCast(CIRGenFunction &cgf, // if the dynamic type of the pointer is exactly the destination type. if (destRecordTy->getAsCXXRecordDecl()->isEffectivelyFinal() && cgf.cgm.getCodeGenOpts().OptimizationLevel > 0) { - cgm.errorNYI(loc, "emitExactDynamicCast"); - return {}; + CIRGenBuilderTy &builder = cgf.getBuilder(); + // If this isn't a reference cast, check the pointer to see if it's null. 
+ if (!isRefCast) {
+ mlir::Value srcPtrIsNull = builder.createPtrIsNull(src.getPointer());
+ return cir::TernaryOp::create(
+ builder, loc, srcPtrIsNull,
+ [&](mlir::OpBuilder &, mlir::Location) {
+ builder.createYield(
+ loc, builder.getNullPtr(destCIRTy, loc).getResult());
+ },
+ [&](mlir::OpBuilder &, mlir::Location) {
+ mlir::Value exactCast = emitExactDynamicCast(
+ *this, cgf, loc, srcRecordTy, destRecordTy, destCIRTy,
+ isRefCast, src);
+ builder.createYield(loc, exactCast);
+ })
+ .getResult();
+ }
+
+ return emitExactDynamicCast(*this, cgf, loc, srcRecordTy, destRecordTy,
+ destCIRTy, isRefCast, src);
 }
 
 cir::DynamicCastInfoAttr castInfo =
@@ -2003,3 +2177,85 @@ mlir::Value CIRGenItaniumCXXABI::emitDynamicCast(CIRGenFunction &cgf,
 return cgf.getBuilder().createDynCast(loc, src.getPointer(), destCIRTy,
 isRefCast, castInfo);
 }
+
+/// The Itanium ABI always places an offset to the complete object
+/// at entry -2 in the vtable.
+void CIRGenItaniumCXXABI::emitVirtualObjectDelete(
+ CIRGenFunction &cgf, const CXXDeleteExpr *delExpr, Address ptr,
+ QualType elementType, const CXXDestructorDecl *dtor) {
+ bool useGlobalDelete = delExpr->isGlobalDelete();
+ if (useGlobalDelete) {
+ cgf.cgm.errorNYI(delExpr->getSourceRange(),
+ "emitVirtualObjectDelete: global delete");
+ }
+
+ CXXDtorType dtorType = useGlobalDelete ? Dtor_Complete : Dtor_Deleting;
+ emitVirtualDestructorCall(cgf, dtor, dtorType, ptr, delExpr);
+}
+
+/************************** Array allocation cookies **************************/
+
+CharUnits CIRGenItaniumCXXABI::getArrayCookieSizeImpl(QualType elementType) {
+ // The array cookie is a size_t; pad that up to the element alignment.
+ // The cookie is actually right-justified in that space. 
+ return std::max( + cgm.getSizeSize(), + cgm.getASTContext().getPreferredTypeAlignInChars(elementType)); +} + +Address CIRGenItaniumCXXABI::initializeArrayCookie(CIRGenFunction &cgf, + Address newPtr, + mlir::Value numElements, + const CXXNewExpr *e, + QualType elementType) { + assert(requiresArrayCookie(e)); + + // TODO: When sanitizer support is implemented, we'll need to + // get the address space from `newPtr`. + assert(!cir::MissingFeatures::addressSpace()); + assert(!cir::MissingFeatures::sanitizers()); + + ASTContext &ctx = cgm.getASTContext(); + CharUnits sizeSize = cgf.getSizeSize(); + mlir::Location loc = cgf.getLoc(e->getSourceRange()); + + // The size of the cookie. + CharUnits cookieSize = + std::max(sizeSize, ctx.getPreferredTypeAlignInChars(elementType)); + assert(cookieSize == getArrayCookieSizeImpl(elementType)); + + cir::PointerType u8PtrTy = cgf.getBuilder().getUInt8PtrTy(); + mlir::Value baseBytePtr = + cgf.getBuilder().createPtrBitcast(newPtr.getPointer(), u8PtrTy); + + // Compute an offset to the cookie. + CharUnits cookieOffset = cookieSize - sizeSize; + mlir::Value cookiePtrValue = baseBytePtr; + if (!cookieOffset.isZero()) { + mlir::Value offsetOp = cgf.getBuilder().getSignedInt( + loc, cookieOffset.getQuantity(), /*width=*/32); + cookiePtrValue = + cgf.getBuilder().createPtrStride(loc, cookiePtrValue, offsetOp); + } + + CharUnits baseAlignment = newPtr.getAlignment(); + CharUnits cookiePtrAlignment = baseAlignment.alignmentAtOffset(cookieOffset); + Address cookiePtr(cookiePtrValue, u8PtrTy, cookiePtrAlignment); + + // Write the number of elements into the appropriate slot. + Address numElementsPtr = + cookiePtr.withElementType(cgf.getBuilder(), cgf.SizeTy); + cgf.getBuilder().createStore(loc, numElements, numElementsPtr); + + // Finally, compute a pointer to the actual data buffer by skipping + // over the cookie completely. 
+ mlir::Value dataOffset = + cgf.getBuilder().getSignedInt(loc, cookieSize.getQuantity(), + /*width=*/32); + mlir::Value dataPtr = + cgf.getBuilder().createPtrStride(loc, baseBytePtr, dataOffset); + mlir::Value finalPtr = + cgf.getBuilder().createPtrBitcast(dataPtr, newPtr.getElementType()); + CharUnits finalAlignment = baseAlignment.alignmentAtOffset(cookieSize); + return Address(finalPtr, newPtr.getElementType(), finalAlignment); +} diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.cpp b/clang/lib/CIR/CodeGen/CIRGenModule.cpp index 74d39b73fff8f..46adfe28e377a 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenModule.cpp @@ -102,7 +102,7 @@ CIRGenModule::CIRGenModule(mlir::MLIRContext &mlirContext, // TODO(CIR): Should be updated once TypeSizeInfoAttr is upstreamed const unsigned sizeTypeSize = astContext.getTypeSize(astContext.getSignedSizeType()); - SizeAlignInBytes = astContext.toCharUnitsFromBits(sizeTypeSize).getQuantity(); + SizeSizeInBytes = astContext.toCharUnitsFromBits(sizeTypeSize).getQuantity(); // In CIRGenTypeCache, UIntPtrTy and SizeType are fields of the same union UIntPtrTy = cir::IntType::get(&getMLIRContext(), sizeTypeSize, /*isSigned=*/false); @@ -535,7 +535,7 @@ cir::GlobalOp CIRGenModule::createGlobalOp(CIRGenModule &cgm, builder.setInsertionPointToStart(cgm.getModule().getBody()); } - g = builder.create(loc, name, t, isConstant); + g = cir::GlobalOp::create(builder, loc, name, t, isConstant); if (!insertPoint) cgm.lastGlobalOp = g; @@ -739,8 +739,8 @@ mlir::Value CIRGenModule::getAddrOfGlobalVar(const VarDecl *d, mlir::Type ty, cir::GlobalOp g = getOrCreateCIRGlobal(d, ty, isForDefinition); mlir::Type ptrTy = builder.getPointerTo(g.getSymType()); - return builder.create(getLoc(d->getSourceRange()), ptrTy, - g.getSymName()); + return cir::GetGlobalOp::create(builder, getLoc(d->getSourceRange()), ptrTy, + g.getSymName()); } cir::GlobalViewAttr CIRGenModule::getAddrOfGlobalVarAttr(const VarDecl *d) { @@ -2176,7 
+2176,7 @@ CIRGenModule::createCIRFunction(mlir::Location loc, StringRef name, if (cgf) builder.setInsertionPoint(cgf->curFn); - func = builder.create(loc, name, funcType); + func = cir::FuncOp::create(builder, loc, name, funcType); assert(!cir::MissingFeatures::opFuncAstDeclAttr()); diff --git a/clang/lib/CIR/CodeGen/CIRGenModule.h b/clang/lib/CIR/CodeGen/CIRGenModule.h index 1fc116d98a858..186913d1bac9d 100644 --- a/clang/lib/CIR/CodeGen/CIRGenModule.h +++ b/clang/lib/CIR/CodeGen/CIRGenModule.h @@ -496,6 +496,8 @@ class CIRGenModule : public CIRGenTypeCache { bool assumeConvergent = false); static constexpr const char *builtinCoroId = "__builtin_coro_id"; + static constexpr const char *builtinCoroAlloc = "__builtin_coro_alloc"; + static constexpr const char *builtinCoroBegin = "__builtin_coro_begin"; /// Given a builtin id for a function like "__builtin_fabsf", return a /// Function* for "fabsf". diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp index 5ba6bcb192b91..e7bf3bcc85c0b 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACC.cpp @@ -27,8 +27,8 @@ mlir::Value createBound(CIRGenFunction &cgf, CIRGen::CIRGenBuilderTy &builder, // Stride is always 1 in C/C++. mlir::Value stride = cgf.createOpenACCConstantInt(boundLoc, 64, 1); - auto bound = - builder.create(boundLoc, lowerBound, upperBound); + auto bound = mlir::acc::DataBoundsOp::create(builder, boundLoc, lowerBound, + upperBound); bound.getStartIdxMutable().assign(startIdx); if (extent) bound.getExtentMutable().assign(extent); @@ -48,8 +48,8 @@ mlir::Value CIRGenFunction::emitOpenACCIntExpr(const Expr *intExpr) { ? 
mlir::IntegerType::SignednessSemantics::Signed : mlir::IntegerType::SignednessSemantics::Unsigned); - auto conversionOp = builder.create( - exprLoc, targetType, expr); + auto conversionOp = mlir::UnrealizedConversionCastOp::create( + builder, exprLoc, targetType, expr); return conversionOp.getResult(0); } @@ -59,8 +59,8 @@ mlir::Value CIRGenFunction::createOpenACCConstantInt(mlir::Location loc, mlir::IntegerType ty = mlir::IntegerType::get(&getMLIRContext(), width, mlir::IntegerType::SignednessSemantics::Signless); - auto constOp = builder.create( - loc, builder.getIntegerAttr(ty, value)); + auto constOp = mlir::arith::ConstantOp::create( + builder, loc, builder.getIntegerAttr(ty, value)); return constOp; } diff --git a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp index ce4ae7ec5efc4..50101373f3e9c 100644 --- a/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenOpenACCClause.cpp @@ -96,8 +96,8 @@ class OpenACCClauseCIREmitter final mlir::IntegerType targetType = mlir::IntegerType::get( &cgf.getMLIRContext(), /*width=*/1, mlir::IntegerType::SignednessSemantics::Signless); - auto conversionOp = builder.create( - exprLoc, targetType, condition); + auto conversionOp = mlir::UnrealizedConversionCastOp::create( + builder, exprLoc, targetType, condition); return conversionOp.getResult(0); } @@ -107,8 +107,8 @@ class OpenACCClauseCIREmitter final mlir::IntegerType ty = mlir::IntegerType::get( &cgf.getMLIRContext(), width, mlir::IntegerType::SignednessSemantics::Signless); - auto constOp = builder.create( - loc, builder.getIntegerAttr(ty, value)); + auto constOp = mlir::arith::ConstantOp::create( + builder, loc, builder.getIntegerAttr(ty, value)); return constOp; } @@ -217,8 +217,8 @@ class OpenACCClauseCIREmitter final cgf.getOpenACCDataOperandInfo(varOperand); auto beforeOp = - builder.create(opInfo.beginLoc, opInfo.varValue, structured, - implicit, opInfo.name, opInfo.bounds); + 
BeforeOpTy::create(builder, opInfo.beginLoc, opInfo.varValue, + structured, implicit, opInfo.name, opInfo.bounds); operation.getDataClauseOperandsMutable().append(beforeOp.getResult()); AfterOpTy afterOp; @@ -231,12 +231,12 @@ class OpenACCClauseCIREmitter final // Detach/Delete ops don't have the variable reference here, so they // take 1 fewer argument to their build function. afterOp = - builder.create(opInfo.beginLoc, beforeOp, structured, - implicit, opInfo.name, opInfo.bounds); + AfterOpTy::create(builder, opInfo.beginLoc, beforeOp, structured, + implicit, opInfo.name, opInfo.bounds); } else { - afterOp = builder.create( - opInfo.beginLoc, beforeOp, opInfo.varValue, structured, implicit, - opInfo.name, opInfo.bounds); + afterOp = AfterOpTy::create(builder, opInfo.beginLoc, beforeOp, + opInfo.varValue, structured, implicit, + opInfo.name, opInfo.bounds); } } @@ -258,8 +258,8 @@ class OpenACCClauseCIREmitter final CIRGenFunction::OpenACCDataOperandInfo opInfo = cgf.getOpenACCDataOperandInfo(varOperand); auto beforeOp = - builder.create(opInfo.beginLoc, opInfo.varValue, structured, - implicit, opInfo.name, opInfo.bounds); + BeforeOpTy::create(builder, opInfo.beginLoc, opInfo.varValue, + structured, implicit, opInfo.name, opInfo.bounds); operation.getDataClauseOperandsMutable().append(beforeOp.getResult()); // Set the 'rest' of the info for the operation. 
@@ -553,12 +553,15 @@ class OpenACCClauseCIREmitter final } void VisitIfClause(const OpenACCIfClause &clause) { - if constexpr (isOneOfTypes) { + if constexpr (isOneOfTypes< + OpTy, mlir::acc::ParallelOp, mlir::acc::SerialOp, + mlir::acc::KernelsOp, mlir::acc::InitOp, + mlir::acc::ShutdownOp, mlir::acc::SetOp, + mlir::acc::DataOp, mlir::acc::WaitOp, + mlir::acc::HostDataOp, mlir::acc::EnterDataOp, + mlir::acc::ExitDataOp, mlir::acc::UpdateOp, + mlir::acc::AtomicReadOp, mlir::acc::AtomicWriteOp, + mlir::acc::AtomicUpdateOp, mlir::acc::AtomicCaptureOp>) { operation.getIfCondMutable().append( createCondition(clause.getConditionExpr())); } else if constexpr (isCombinedType) { @@ -1144,6 +1147,10 @@ EXPL_SPEC(mlir::acc::HostDataOp) EXPL_SPEC(mlir::acc::EnterDataOp) EXPL_SPEC(mlir::acc::ExitDataOp) EXPL_SPEC(mlir::acc::UpdateOp) +EXPL_SPEC(mlir::acc::AtomicReadOp) +EXPL_SPEC(mlir::acc::AtomicWriteOp) +EXPL_SPEC(mlir::acc::AtomicCaptureOp) +EXPL_SPEC(mlir::acc::AtomicUpdateOp) #undef EXPL_SPEC template diff --git a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp index ad8c4d004654f..1eb7199ce6dfe 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmt.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmt.cpp @@ -91,8 +91,9 @@ mlir::LogicalResult CIRGenFunction::emitCompoundStmt(const CompoundStmt &s, SymTableScopeTy varScope(symbolTable); mlir::Location scopeLoc = getLoc(s.getSourceRange()); mlir::OpBuilder::InsertPoint scopeInsPt; - builder.create( - scopeLoc, [&](mlir::OpBuilder &b, mlir::Type &type, mlir::Location loc) { + cir::ScopeOp::create( + builder, scopeLoc, + [&](mlir::OpBuilder &b, mlir::Type &type, mlir::Location loc) { scopeInsPt = b.saveInsertionPoint(); }); mlir::OpBuilder::InsertionGuard guard(builder); @@ -423,12 +424,12 @@ mlir::LogicalResult CIRGenFunction::emitIfStmt(const IfStmt &s) { // LexicalScope ConditionScope(*this, S.getCond()->getSourceRange()); // The if scope contains the full source range for IfStmt. 
mlir::Location scopeLoc = getLoc(s.getSourceRange()); - builder.create( - scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{*this, scopeLoc, builder.getInsertionBlock()}; - res = ifStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, scopeLoc, + builder.getInsertionBlock()}; + res = ifStmtBuilder(); + }); return res; } @@ -446,54 +447,89 @@ mlir::LogicalResult CIRGenFunction::emitReturnStmt(const ReturnStmt &s) { mlir::Location loc = getLoc(s.getSourceRange()); const Expr *rv = s.getRetValue(); - if (getContext().getLangOpts().ElideConstructors && s.getNRVOCandidate() && - s.getNRVOCandidate()->isNRVOVariable()) { - assert(!cir::MissingFeatures::openMP()); - assert(!cir::MissingFeatures::nrvo()); - } else if (!rv) { - // No return expression. Do nothing. - } else if (rv->getType()->isVoidType()) { - // Make sure not to return anything, but evaluate the expression - // for side effects. - if (rv) { - emitAnyExpr(rv); + RunCleanupsScope cleanupScope(*this); + bool createNewScope = false; + if (const auto *ewc = dyn_cast_or_null(rv)) { + rv = ewc->getSubExpr(); + createNewScope = true; + } + + auto handleReturnVal = [&]() { + if (getContext().getLangOpts().ElideConstructors && s.getNRVOCandidate() && + s.getNRVOCandidate()->isNRVOVariable()) { + assert(!cir::MissingFeatures::openMP()); + assert(!cir::MissingFeatures::nrvo()); + } else if (!rv) { + // No return expression. Do nothing. + } else if (rv->getType()->isVoidType()) { + // Make sure not to return anything, but evaluate the expression + // for side effects. + if (rv) { + emitAnyExpr(rv); + } + } else if (cast(curGD.getDecl()) + ->getReturnType() + ->isReferenceType()) { + // If this function returns a reference, take the address of the + // expression rather than the value. 
+ RValue result = emitReferenceBindingToExpr(rv); + builder.CIRBaseBuilderTy::createStore(loc, result.getValue(), + *fnRetAlloca); + } else { + mlir::Value value = nullptr; + switch (CIRGenFunction::getEvaluationKind(rv->getType())) { + case cir::TEK_Scalar: + value = emitScalarExpr(rv); + if (value) { // Change this to an assert once emitScalarExpr is complete + builder.CIRBaseBuilderTy::createStore(loc, value, *fnRetAlloca); + } + break; + case cir::TEK_Complex: + emitComplexExprIntoLValue(rv, + makeAddrLValue(returnValue, rv->getType()), + /*isInit=*/true); + break; + case cir::TEK_Aggregate: + assert(!cir::MissingFeatures::aggValueSlotGC()); + emitAggExpr(rv, AggValueSlot::forAddr(returnValue, Qualifiers(), + AggValueSlot::IsDestructed, + AggValueSlot::IsNotAliased, + getOverlapForReturnValue())); + break; + } } - } else if (cast(curGD.getDecl()) - ->getReturnType() - ->isReferenceType()) { - // If this function returns a reference, take the address of the - // expression rather than the value. - RValue result = emitReferenceBindingToExpr(rv); - builder.CIRBaseBuilderTy::createStore(loc, result.getValue(), *fnRetAlloca); + }; + + if (!createNewScope) { + handleReturnVal(); } else { - mlir::Value value = nullptr; - switch (CIRGenFunction::getEvaluationKind(rv->getType())) { - case cir::TEK_Scalar: - value = emitScalarExpr(rv); - if (value) { // Change this to an assert once emitScalarExpr is complete - builder.CIRBaseBuilderTy::createStore(loc, value, *fnRetAlloca); - } - break; - case cir::TEK_Complex: - emitComplexExprIntoLValue(rv, makeAddrLValue(returnValue, rv->getType()), - /*isInit=*/true); - break; - case cir::TEK_Aggregate: - assert(!cir::MissingFeatures::aggValueSlotGC()); - emitAggExpr(rv, AggValueSlot::forAddr(returnValue, Qualifiers(), - AggValueSlot::IsDestructed, - AggValueSlot::IsNotAliased, - getOverlapForReturnValue())); - break; + mlir::Location scopeLoc = + getLoc(rv ? 
rv->getSourceRange() : s.getSourceRange()); + // First create cir.scope and later emit it's body. Otherwise all CIRGen + // dispatched by `handleReturnVal()` might needs to manipulate blocks and + // look into parents, which are all unlinked. + mlir::OpBuilder::InsertPoint scopeBody; + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + scopeBody = b.saveInsertionPoint(); + }); + { + mlir::OpBuilder::InsertionGuard guard(builder); + builder.restoreInsertionPoint(scopeBody); + CIRGenFunction::LexicalScope lexScope{*this, scopeLoc, + builder.getInsertionBlock()}; + handleReturnVal(); } } + cleanupScope.forceCleanup(); + + // In CIR we might have returns in different scopes. + // FIXME(cir): cleanup code is handling actual return emission, the logic + // should try to match traditional codegen more closely (to the extent which + // is possible). auto *retBlock = curLexScope->getOrCreateRetBlock(*this, loc); - // This should emit a branch through the cleanup block if one exists. - builder.create(loc, retBlock); - assert(!cir::MissingFeatures::emitBranchThroughCleanup()); - if (ehStack.stable_begin() != currentCleanupStackDepth) - cgm.errorNYI(s.getSourceRange(), "return with cleanup stack"); + emitBranchThroughCleanup(loc, returnBlock(retBlock)); // Insert the new block to continue codegen after branch to ret block. 
builder.createBlock(builder.getBlock()->getParent()); @@ -541,11 +577,11 @@ mlir::LogicalResult CIRGenFunction::emitLabel(const clang::LabelDecl &d) { mlir::OpBuilder::InsertionGuard guard(builder); labelBlock = builder.createBlock(builder.getBlock()->getParent()); } - builder.create(getLoc(d.getSourceRange()), labelBlock); + cir::BrOp::create(builder, getLoc(d.getSourceRange()), labelBlock); } builder.setInsertionPointToEnd(labelBlock); - builder.create(getLoc(d.getSourceRange()), d.getName()); + cir::LabelOp::create(builder, getLoc(d.getSourceRange()), d.getName()); builder.setInsertionPointToEnd(labelBlock); // FIXME: emit debug info for labels, incrementProfileCounter @@ -582,7 +618,7 @@ CIRGenFunction::emitCaseDefaultCascade(const T *stmt, mlir::Type condType, const Stmt *sub = stmt->getSubStmt(); mlir::OpBuilder::InsertPoint insertPoint; - builder.create(loc, value, kind, insertPoint); + CaseOp::create(builder, loc, value, kind, insertPoint); { mlir::OpBuilder::InsertionGuard guardSwitch(builder); @@ -754,16 +790,16 @@ CIRGenFunction::emitCXXForRangeStmt(const CXXForRangeStmt &s, mlir::LogicalResult res = mlir::success(); mlir::Location scopeLoc = getLoc(s.getSourceRange()); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - // Create a cleanup scope for the condition - // variable cleanups. Logical equivalent from - // LLVM codegn for LexicalScope - // ConditionScope(*this, S.getSourceRange())... - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = forStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + // Create a cleanup scope for the condition + // variable cleanups. Logical equivalent from + // LLVM codegn for LexicalScope + // ConditionScope(*this, S.getSourceRange())... 
+ LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = forStmtBuilder(); + }); if (res.failed()) return res; @@ -806,7 +842,7 @@ mlir::LogicalResult CIRGenFunction::emitForStmt(const ForStmt &s) { // scalar type. condVal = evaluateExprAsBool(s.getCond()); } else { - condVal = b.create(loc, builder.getTrueAttr()); + condVal = cir::ConstantOp::create(b, loc, builder.getTrueAttr()); } builder.createCondition(condVal); }, @@ -830,12 +866,12 @@ mlir::LogicalResult CIRGenFunction::emitForStmt(const ForStmt &s) { auto res = mlir::success(); auto scopeLoc = getLoc(s.getSourceRange()); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = forStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = forStmtBuilder(); + }); if (res.failed()) return res; @@ -881,12 +917,12 @@ mlir::LogicalResult CIRGenFunction::emitDoStmt(const DoStmt &s) { mlir::LogicalResult res = mlir::success(); mlir::Location scopeLoc = getLoc(s.getSourceRange()); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = doStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = doStmtBuilder(); + }); if (res.failed()) return res; @@ -937,12 +973,12 @@ mlir::LogicalResult CIRGenFunction::emitWhileStmt(const WhileStmt &s) { mlir::LogicalResult res = mlir::success(); mlir::Location scopeLoc = getLoc(s.getSourceRange()); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{ - *this, loc, 
builder.getInsertionBlock()}; - res = whileStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = whileStmtBuilder(); + }); if (res.failed()) return res; @@ -1013,8 +1049,8 @@ mlir::LogicalResult CIRGenFunction::emitSwitchStmt(const clang::SwitchStmt &s) { assert(!cir::MissingFeatures::insertBuiltinUnpredictable()); mlir::LogicalResult res = mlir::success(); - swop = builder.create( - getLoc(s.getBeginLoc()), condV, + swop = SwitchOp::create( + builder, getLoc(s.getBeginLoc()), condV, /*switchBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc, mlir::OperationState &os) { curLexScope->setAsSwitch(); @@ -1032,12 +1068,12 @@ mlir::LogicalResult CIRGenFunction::emitSwitchStmt(const clang::SwitchStmt &s) { // The switch scope contains the full source range for SwitchStmt. mlir::Location scopeLoc = getLoc(s.getSourceRange()); mlir::LogicalResult res = mlir::success(); - builder.create(scopeLoc, /*scopeBuilder=*/ - [&](mlir::OpBuilder &b, mlir::Location loc) { - LexicalScope lexScope{ - *this, loc, builder.getInsertionBlock()}; - res = switchStmtBuilder(); - }); + cir::ScopeOp::create(builder, scopeLoc, /*scopeBuilder=*/ + [&](mlir::OpBuilder &b, mlir::Location loc) { + LexicalScope lexScope{*this, loc, + builder.getInsertionBlock()}; + res = switchStmtBuilder(); + }); llvm::SmallVector cases; swop.collectCases(cases); @@ -1061,7 +1097,7 @@ void CIRGenFunction::emitReturnOfRValue(mlir::Location loc, RValue rv, } mlir::Block *retBlock = curLexScope->getOrCreateRetBlock(*this, loc); assert(!cir::MissingFeatures::emitBranchThroughCleanup()); - builder.create(loc, retBlock); + cir::BrOp::create(builder, loc, retBlock); if (ehStack.stable_begin() != currentCleanupStackDepth) - cgm.errorNYI(loc, "return with cleanup stack"); + cgm.errorNYI(loc, "return of r-value with cleanup stack"); } diff --git 
a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp index e89393c92db33..9e55bd5b7ae71 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACC.cpp @@ -30,7 +30,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpAssociatedStmt( llvm::SmallVector retTy; llvm::SmallVector operands; - auto op = builder.create(start, retTy, operands); + auto op = Op::create(builder, start, retTy, operands); emitOpenACCClauses(op, dirKind, dirLoc, clauses); @@ -42,7 +42,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpAssociatedStmt( LexicalScope ls{*this, start, builder.getInsertionBlock()}; res = emitStmt(associatedStmt, /*useCurrentScope=*/true); - builder.create(end); + TermOp::create(builder, end); } return res; } @@ -73,7 +73,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( llvm::SmallVector retTy; llvm::SmallVector operands; - auto computeOp = builder.create(start, retTy, operands); + auto computeOp = Op::create(builder, start, retTy, operands); computeOp.setCombinedAttr(builder.getUnitAttr()); mlir::acc::LoopOp loopOp; @@ -85,7 +85,7 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( builder.setInsertionPointToEnd(&block); LexicalScope ls{*this, start, builder.getInsertionBlock()}; - auto loopOp = builder.create(start, retTy, operands); + auto loopOp = LoopOp::create(builder, start, retTy, operands); loopOp.setCombinedAttr(mlir::acc::CombinedConstructsTypeAttr::get( builder.getContext(), CombinedType::value)); @@ -99,14 +99,14 @@ mlir::LogicalResult CIRGenFunction::emitOpenACCOpCombinedConstruct( res = emitStmt(loopStmt, /*useCurrentScope=*/true); - builder.create(end); + mlir::acc::YieldOp::create(builder, end); } emitOpenACCClauses(computeOp, loopOp, dirKind, dirLoc, clauses); updateLoopOpParallelism(loopOp, /*isOrphan=*/false, dirKind); - builder.create(end); + TermOp::create(builder, end); } return res; @@ -118,7 +118,7 @@ Op 
CIRGenFunction::emitOpenACCOp( llvm::ArrayRef clauses) { llvm::SmallVector retTy; llvm::SmallVector operands; - auto op = builder.create(start, retTy, operands); + auto op = Op::create(builder, start, retTy, operands); emitOpenACCClauses(op, dirKind, dirLoc, clauses); return op; @@ -197,8 +197,8 @@ CIRGenFunction::emitOpenACCWaitConstruct(const OpenACCWaitConstruct &s) { ? mlir::IntegerType::SignednessSemantics::Signed : mlir::IntegerType::SignednessSemantics::Unsigned); - auto conversionOp = builder.create( - exprLoc, targetType, expr); + auto conversionOp = mlir::UnrealizedConversionCastOp::create( + builder, exprLoc, targetType, expr); return conversionOp.getResult(0); }; @@ -294,9 +294,9 @@ CIRGenFunction::emitOpenACCCacheConstruct(const OpenACCCacheConstruct &s) { CIRGenFunction::OpenACCDataOperandInfo opInfo = getOpenACCDataOperandInfo(var); - auto cacheOp = builder.create( - opInfo.beginLoc, opInfo.varValue, - /*structured=*/false, /*implicit=*/false, opInfo.name, opInfo.bounds); + auto cacheOp = CacheOp::create(builder, opInfo.beginLoc, opInfo.varValue, + /*structured=*/false, /*implicit=*/false, + opInfo.name, opInfo.bounds); loopOp.getCacheOperandsMutable().append(cacheOp.getResult()); } @@ -304,8 +304,108 @@ CIRGenFunction::emitOpenACCCacheConstruct(const OpenACCCacheConstruct &s) { return mlir::success(); } +const VarDecl *getLValueDecl(const Expr *e) { + // We are going to assume that after stripping implicit casts, that the LValue + // is just a DRE around the var-decl. + + e = e->IgnoreImpCasts(); + + const auto *dre = cast(e); + return cast(dre->getDecl()); +} + mlir::LogicalResult CIRGenFunction::emitOpenACCAtomicConstruct(const OpenACCAtomicConstruct &s) { - cgm.errorNYI(s.getSourceRange(), "OpenACC Atomic Construct"); - return mlir::failure(); + // For now, we are only support 'read'/'write'/'update', so diagnose. We can + // switch on the kind later once we implement the 'capture' form. 
+ if (s.getAtomicKind() == OpenACCAtomicKind::Capture) { + cgm.errorNYI(s.getSourceRange(), "OpenACC Atomic Construct"); + return mlir::failure(); + } + + // While Atomic is an 'associated statement' construct, it 'steals' the + // expression it is associated with rather than emitting it inside of it. So + // it has custom emit logic. + mlir::Location start = getLoc(s.getSourceRange().getBegin()); + mlir::Location end = getLoc(s.getSourceRange().getEnd()); + OpenACCAtomicConstruct::StmtInfo inf = s.getAssociatedStmtInfo(); + + switch (s.getAtomicKind()) { + case OpenACCAtomicKind::Capture: + llvm_unreachable("Unimplemented atomic construct type, should have " + "diagnosed/returned above"); + return mlir::failure(); + case OpenACCAtomicKind::Read: { + + // Atomic 'read' only permits 'v = x', where v and x are both scalar L + // values. The getAssociatedStmtInfo strips off implicit casts, which + // includes implicit conversions and L-to-R-Value conversions, so we can + // just emit it as an L value. The Flang implementation has no problem with + // different types, so it appears that the dialect can handle the + // conversions. 
+ mlir::Value v = emitLValue(inf.V).getPointer(); + mlir::Value x = emitLValue(inf.X).getPointer(); + mlir::Type resTy = convertType(inf.V->getType()); + auto op = mlir::acc::AtomicReadOp::create(builder, start, x, v, resTy, + /*ifCond=*/{}); + emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), + s.clauses()); + return mlir::success(); + } + case OpenACCAtomicKind::Write: { + mlir::Value x = emitLValue(inf.X).getPointer(); + mlir::Value expr = emitAnyExpr(inf.RefExpr).getValue(); + auto op = mlir::acc::AtomicWriteOp::create(builder, start, x, expr, + /*ifCond=*/{}); + emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), + s.clauses()); + return mlir::success(); + } + case OpenACCAtomicKind::None: + case OpenACCAtomicKind::Update: { + mlir::Value x = emitLValue(inf.X).getPointer(); + auto op = + mlir::acc::AtomicUpdateOp::create(builder, start, x, /*ifCond=*/{}); + emitOpenACCClauses(op, s.getDirectiveKind(), s.getDirectiveLoc(), + s.clauses()); + mlir::LogicalResult res = mlir::success(); + { + mlir::OpBuilder::InsertionGuard guardCase(builder); + mlir::Type argTy = cast(x.getType()).getPointee(); + std::array recipeType{argTy}; + std::array recipeLoc{start}; + mlir::Block *recipeBlock = builder.createBlock( + &op.getRegion(), op.getRegion().end(), recipeType, recipeLoc); + builder.setInsertionPointToEnd(recipeBlock); + + // Since we have an initial value that we know is a scalar type, we can + // just emit the entire statement here after sneaking-in our 'alloca' in + // the right place, then loading out of it. Flang does a lot less work + // (probably does its own emitting!), but we have more complicated AST + // nodes to worry about, so we can just count on opt to remove the extra + // alloca/load/store set. 
+ auto alloca = cir::AllocaOp::create( + builder, start, x.getType(), argTy, "x_var", + cgm.getSize(getContext().getTypeAlignInChars(inf.X->getType()))); + + alloca.setInitAttr(mlir::UnitAttr::get(&getMLIRContext())); + builder.CIRBaseBuilderTy::createStore(start, recipeBlock->getArgument(0), + alloca); + + const VarDecl *xval = getLValueDecl(inf.X); + CIRGenFunction::DeclMapRevertingRAII declMapRAII{*this, xval}; + replaceAddrOfLocalVar( + xval, Address{alloca, argTy, getContext().getDeclAlign(xval)}); + + res = emitStmt(s.getAssociatedStmt(), /*useCurrentScope=*/true); + + auto load = cir::LoadOp::create(builder, start, {alloca}); + mlir::acc::YieldOp::create(builder, end, {load}); + } + + return res; + } + } + + llvm_unreachable("unknown OpenACC atomic kind"); } diff --git a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp index f3911ae8b6772..c5b89bd7b1fb6 100644 --- a/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp +++ b/clang/lib/CIR/CodeGen/CIRGenStmtOpenACCLoop.cpp @@ -58,7 +58,7 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { mlir::Location end = getLoc(s.getSourceRange().getEnd()); llvm::SmallVector retTy; llvm::SmallVector operands; - auto op = builder.create(start, retTy, operands); + auto op = LoopOp::create(builder, start, retTy, operands); // TODO(OpenACC): In the future we are going to need to come up with a // transformation here that can teach the acc.loop how to figure out the @@ -133,7 +133,7 @@ CIRGenFunction::emitOpenACCLoopConstruct(const OpenACCLoopConstruct &s) { ActiveOpenACCLoopRAII activeLoop{*this, &op}; stmtRes = emitStmt(s.getLoop(), /*useCurrentScope=*/true); - builder.create(end); + mlir::acc::YieldOp::create(builder, end); } return stmtRes; diff --git a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h index b5612d9127506..ff5842cd86e04 100644 --- a/clang/lib/CIR/CodeGen/CIRGenTypeCache.h +++ 
b/clang/lib/CIR/CodeGen/CIRGenTypeCache.h @@ -74,11 +74,17 @@ struct CIRGenTypeCache { unsigned char PointerSizeInBytes; }; - /// The alignment of size_t. - unsigned char SizeAlignInBytes; + /// The size and alignment of size_t. + union { + unsigned char SizeSizeInBytes; // sizeof(size_t) + unsigned char SizeAlignInBytes; + }; cir::TargetAddressSpaceAttr cirAllocaAddressSpace; + clang::CharUnits getSizeSize() const { + return clang::CharUnits::fromQuantity(SizeSizeInBytes); + } clang::CharUnits getSizeAlign() const { return clang::CharUnits::fromQuantity(SizeAlignInBytes); } diff --git a/clang/lib/CIR/CodeGen/CMakeLists.txt b/clang/lib/CIR/CodeGen/CMakeLists.txt index 36db4bd9bab25..7c31beacc5fb3 100644 --- a/clang/lib/CIR/CodeGen/CMakeLists.txt +++ b/clang/lib/CIR/CodeGen/CMakeLists.txt @@ -11,13 +11,14 @@ add_clang_library(clangCIR CIRGenAsm.cpp CIRGenAtomic.cpp CIRGenBuilder.cpp + CIRGenBuiltin.cpp + CIRGenBuiltinX86.cpp CIRGenCall.cpp CIRGenClass.cpp CIRGenCleanup.cpp CIRGenCoroutine.cpp CIRGenCXX.cpp CIRGenCXXABI.cpp - CIRGenBuiltin.cpp CIRGenDecl.cpp CIRGenDeclCXX.cpp CIRGenDeclOpenACC.cpp diff --git a/clang/lib/CIR/CodeGen/EHScopeStack.h b/clang/lib/CIR/CodeGen/EHScopeStack.h index 1d6e671b21b79..4198c23c9cbed 100644 --- a/clang/lib/CIR/CodeGen/EHScopeStack.h +++ b/clang/lib/CIR/CodeGen/EHScopeStack.h @@ -18,12 +18,38 @@ #ifndef CLANG_LIB_CIR_CODEGEN_EHSCOPESTACK_H #define CLANG_LIB_CIR_CODEGEN_EHSCOPESTACK_H +#include "clang/CIR/Dialect/IR/CIRDialect.h" #include "llvm/ADT/SmallVector.h" namespace clang::CIRGen { class CIRGenFunction; +/// A branch fixup. These are required when emitting a goto to a +/// label which hasn't been emitted yet. The goto is optimistically +/// emitted as a branch to the basic block for the label, and (if it +/// occurs in a scope with non-trivial cleanups) a fixup is added to +/// the innermost cleanup. When a (normal) cleanup is popped, any +/// unresolved fixups in that scope are threaded through the cleanup. 
+struct BranchFixup { + /// The block containing the terminator which needs to be modified + /// into a switch if this fixup is resolved into the current scope. + /// If null, LatestBranch points directly to the destination. + mlir::Block *optimisticBranchBlock = nullptr; + + /// The ultimate destination of the branch. + /// + /// This can be set to null to indicate that this fixup was + /// successfully resolved. + mlir::Block *destination = nullptr; + + /// The destination index value. + unsigned destinationIndex = 0; + + /// The initial branch of the fixup. + cir::BrOp initialBranch = {}; +}; + enum CleanupKind : unsigned { /// Denotes a cleanup that should run when a scope is exited using exceptional /// control flow (a throw statement leading to stack unwinding, ). @@ -126,9 +152,31 @@ class EHScopeStack { /// The first valid entry in the buffer. char *startOfData = nullptr; + /// The innermost normal cleanup on the stack. + stable_iterator innermostNormalCleanup = stable_end(); + /// The CGF this Stack belong to CIRGenFunction *cgf = nullptr; + /// The current set of branch fixups. A branch fixup is a jump to + /// an as-yet unemitted label, i.e. a label for which we don't yet + /// know the EH stack depth. Whenever we pop a cleanup, we have + /// to thread all the current branch fixups through it. + /// + /// Fixups are recorded as the Use of the respective branch or + /// switch statement. The use points to the final destination. + /// When popping out of a cleanup, these uses are threaded through + /// the cleanup and adjusted to point to the new cleanup. + /// + /// Note that branches are allowed to jump into protected scopes + /// in certain situations; e.g. the following code is legal: + /// struct A { ~A(); }; // trivial ctor, non-trivial dtor + /// goto foo; + /// A a; + /// foo: + /// bar(); + llvm::SmallVector branchFixups; + // This class uses a custom allocator for maximum efficiency because cleanups // are allocated and freed very frequently. 
It's basically a bump pointer // allocator, but we can't use LLVM's BumpPtrAllocator because we use offsets @@ -166,6 +214,18 @@ class EHScopeStack { /// Determines whether the exception-scopes stack is empty. bool empty() const { return startOfData == endOfBuffer; } + /// Determines whether there are any normal cleanups on the stack. + bool hasNormalCleanups() const { + return innermostNormalCleanup != stable_end(); + } + + /// Returns the innermost normal cleanup on the stack, or + /// stable_end() if there are no normal cleanups. + stable_iterator getInnermostNormalCleanup() const { + return innermostNormalCleanup; + } + stable_iterator getInnermostActiveNormalCleanup() const; + /// An unstable reference to a scope-stack depth. Invalidated by /// pushes but not pops. class iterator; @@ -180,12 +240,30 @@ class EHScopeStack { return stable_iterator(endOfBuffer - startOfData); } + /// Create a stable reference to the bottom of the EH stack. + static stable_iterator stable_end() { return stable_iterator(0); } + /// Turn a stable reference to a scope depth into a unstable pointer /// to the EH stack. iterator find(stable_iterator savePoint) const; - /// Create a stable reference to the bottom of the EH stack. - static stable_iterator stable_end() { return stable_iterator(0); } + /// Add a branch fixup to the current cleanup scope. + BranchFixup &addBranchFixup() { + assert(hasNormalCleanups() && "adding fixup in scope without cleanups"); + branchFixups.push_back(BranchFixup()); + return branchFixups.back(); + } + + unsigned getNumBranchFixups() const { return branchFixups.size(); } + BranchFixup &getBranchFixup(unsigned i) { + assert(i < getNumBranchFixups()); + return branchFixups[i]; + } + + /// Pops lazily-removed fixups from the end of the list. This + /// should only be called by procedures which have just popped a + /// cleanup or resolved one or more fixups. 
+ void popNullFixups(); }; } // namespace clang::CIRGen diff --git a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp index fa180f54c4474..2d2ef422bfaef 100644 --- a/clang/lib/CIR/Dialect/IR/CIRDialect.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRDialect.cpp @@ -95,8 +95,8 @@ Operation *cir::CIRDialect::materializeConstant(mlir::OpBuilder &builder, mlir::Attribute value, mlir::Type type, mlir::Location loc) { - return builder.create(loc, type, - mlir::cast(value)); + return cir::ConstantOp::create(builder, loc, type, + mlir::cast(value)); } //===----------------------------------------------------------------------===// @@ -184,7 +184,7 @@ static LogicalResult ensureRegionTerm(OpAsmParser &parser, Region ®ion, // Terminator was omitted correctly: recreate it. builder.setInsertionPointToEnd(&block); - builder.create(eLoc); + cir::YieldOp::create(builder, eLoc); return success(); } @@ -977,7 +977,7 @@ void cir::IfOp::print(OpAsmPrinter &p) { /// Default callback for IfOp builders. 
void cir::buildTerminatedBody(OpBuilder &builder, Location loc) { // add cir.yield to end of the block - builder.create(loc); + cir::YieldOp::create(builder, loc); } /// Given the region at `index`, or the parent operation if `index` is None, diff --git a/clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp b/clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp index 7e96ae99b690b..66469e208d7b0 100644 --- a/clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp +++ b/clang/lib/CIR/Dialect/IR/CIRMemorySlot.cpp @@ -34,8 +34,8 @@ llvm::SmallVector cir::AllocaOp::getPromotableSlots() { Value cir::AllocaOp::getDefaultValue(const MemorySlot &slot, OpBuilder &builder) { - return builder.create(getLoc(), - cir::UndefAttr::get(slot.elemType)); + return cir::ConstantOp::create(builder, getLoc(), + cir::UndefAttr::get(slot.elemType)); } void cir::AllocaOp::handleBlockArgument(const MemorySlot &slot, diff --git a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp index 46bd186ccada1..21c96febf8403 100644 --- a/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp +++ b/clang/lib/CIR/Dialect/Transforms/FlattenCFG.cpp @@ -100,8 +100,8 @@ struct CIRIfFlattening : public mlir::OpRewritePattern { } rewriter.setInsertionPointToEnd(currentBlock); - rewriter.create(loc, ifOp.getCondition(), thenBeforeBody, - elseBeforeBody); + cir::BrCondOp::create(rewriter, loc, ifOp.getCondition(), thenBeforeBody, + elseBeforeBody); if (!emptyElse) { rewriter.setInsertionPointToEnd(elseAfterBody); @@ -154,7 +154,7 @@ class CIRScopeOpFlattening : public mlir::OpRewritePattern { // Save stack and then branch into the body of the region. rewriter.setInsertionPointToEnd(currentBlock); assert(!cir::MissingFeatures::stackSaveOp()); - rewriter.create(loc, mlir::ValueRange(), beforeBody); + cir::BrOp::create(rewriter, loc, mlir::ValueRange(), beforeBody); // Replace the scopeop return with a branch that jumps out of the body. // Stack restore before leaving the body region. 
@@ -195,26 +195,27 @@ class CIRSwitchOpFlattening : public mlir::OpRewritePattern { cir::IntType sIntType = cir::IntType::get(op.getContext(), 32, true); cir::IntType uIntType = cir::IntType::get(op.getContext(), 32, false); - cir::ConstantOp rangeLength = rewriter.create( - op.getLoc(), cir::IntAttr::get(sIntType, upperBound - lowerBound)); + cir::ConstantOp rangeLength = cir::ConstantOp::create( + rewriter, op.getLoc(), + cir::IntAttr::get(sIntType, upperBound - lowerBound)); - cir::ConstantOp lowerBoundValue = rewriter.create( - op.getLoc(), cir::IntAttr::get(sIntType, lowerBound)); + cir::ConstantOp lowerBoundValue = cir::ConstantOp::create( + rewriter, op.getLoc(), cir::IntAttr::get(sIntType, lowerBound)); cir::BinOp diffValue = - rewriter.create(op.getLoc(), sIntType, cir::BinOpKind::Sub, - op.getCondition(), lowerBoundValue); + cir::BinOp::create(rewriter, op.getLoc(), sIntType, cir::BinOpKind::Sub, + op.getCondition(), lowerBoundValue); // Use unsigned comparison to check if the condition is in the range. 
- cir::CastOp uDiffValue = rewriter.create( - op.getLoc(), uIntType, CastKind::integral, diffValue); - cir::CastOp uRangeLength = rewriter.create( - op.getLoc(), uIntType, CastKind::integral, rangeLength); - - cir::CmpOp cmpResult = rewriter.create( - op.getLoc(), cir::BoolType::get(op.getContext()), cir::CmpOpKind::le, - uDiffValue, uRangeLength); - rewriter.create(op.getLoc(), cmpResult, rangeDestination, - defaultDestination); + cir::CastOp uDiffValue = cir::CastOp::create( + rewriter, op.getLoc(), uIntType, CastKind::integral, diffValue); + cir::CastOp uRangeLength = cir::CastOp::create( + rewriter, op.getLoc(), uIntType, CastKind::integral, rangeLength); + + cir::CmpOp cmpResult = cir::CmpOp::create( + rewriter, op.getLoc(), cir::BoolType::get(op.getContext()), + cir::CmpOpKind::le, uDiffValue, uRangeLength); + cir::BrCondOp::create(rewriter, op.getLoc(), cmpResult, rangeDestination, + defaultDestination); return resBlock; } @@ -262,7 +263,7 @@ class CIRSwitchOpFlattening : public mlir::OpRewritePattern { rewriteYieldOp(rewriter, switchYield, exitBlock); rewriter.setInsertionPointToEnd(originalBlock); - rewriter.create(op.getLoc(), swopBlock); + cir::BrOp::create(rewriter, op.getLoc(), swopBlock); } // Allocate required data structures (disconsider default case in @@ -331,8 +332,8 @@ class CIRSwitchOpFlattening : public mlir::OpRewritePattern { mlir::Block *newBlock = rewriter.splitBlock(oldBlock, nextOp->getIterator()); rewriter.setInsertionPointToEnd(oldBlock); - rewriter.create(nextOp->getLoc(), mlir::ValueRange(), - newBlock); + cir::BrOp::create(rewriter, nextOp->getLoc(), mlir::ValueRange(), + newBlock); rewriteYieldOp(rewriter, yieldOp, newBlock); } } @@ -346,7 +347,7 @@ class CIRSwitchOpFlattening : public mlir::OpRewritePattern { // Create a branch to the entry of the inlined region. 
rewriter.setInsertionPointToEnd(oldBlock); - rewriter.create(caseOp.getLoc(), &entryBlock); + cir::BrOp::create(rewriter, caseOp.getLoc(), &entryBlock); } // Remove all cases since we've inlined the regions. @@ -427,7 +428,7 @@ class CIRLoopOpInterfaceFlattening // Setup loop entry branch. rewriter.setInsertionPointToEnd(entry); - rewriter.create(op.getLoc(), &op.getEntry().front()); + cir::BrOp::create(rewriter, op.getLoc(), &op.getEntry().front()); // Branch from condition region to body or exit. auto conditionOp = cast(cond->getTerminator()); @@ -499,7 +500,7 @@ class CIRTernaryOpFlattening : public mlir::OpRewritePattern { locs.push_back(loc); Block *continueBlock = rewriter.createBlock(remainingOpsBlock, op->getResultTypes(), locs); - rewriter.create(loc, remainingOpsBlock); + cir::BrOp::create(rewriter, loc, remainingOpsBlock); Region &trueRegion = op.getTrueRegion(); Block *trueBlock = &trueRegion.front(); @@ -542,7 +543,7 @@ class CIRTernaryOpFlattening : public mlir::OpRewritePattern { rewriter.inlineRegionBefore(falseRegion, continueBlock); rewriter.setInsertionPointToEnd(condBlock); - rewriter.create(loc, op.getCond(), trueBlock, falseBlock); + cir::BrCondOp::create(rewriter, loc, op.getCond(), trueBlock, falseBlock); rewriter.replaceOp(op, continueBlock->getArguments()); diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp index d99c36241ab68..cba04649ca05e 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepare.cpp @@ -155,7 +155,7 @@ cir::FuncOp LoweringPreparePass::buildRuntimeFunction( cir::FuncOp f = dyn_cast_or_null(SymbolTable::lookupNearestSymbolFrom( mlirModule, StringAttr::get(mlirModule->getContext(), name))); if (!f) { - f = builder.create(loc, name, type); + f = cir::FuncOp::create(builder, loc, name, type); f.setLinkageAttr( cir::GlobalLinkageKindAttr::get(builder.getContext(), linkage)); 
mlir::SymbolTable::setSymbolVisibility( @@ -400,12 +400,12 @@ buildRangeReductionComplexDiv(CIRBaseBuilderTy &builder, mlir::Location loc, builder.createYield(loc, result); }; - auto cFabs = builder.create(loc, c); - auto dFabs = builder.create(loc, d); + auto cFabs = cir::FAbsOp::create(builder, loc, c); + auto dFabs = cir::FAbsOp::create(builder, loc, d); cir::CmpOp cmpResult = builder.createCompare(loc, cir::CmpOpKind::ge, cFabs, dFabs); - auto ternary = builder.create( - loc, cmpResult, trueBranchBuilder, falseBranchBuilder); + auto ternary = cir::TernaryOp::create(builder, loc, cmpResult, + trueBranchBuilder, falseBranchBuilder); return ternary.getResult(); } @@ -612,18 +612,17 @@ static mlir::Value lowerComplexMul(LoweringPreparePass &pass, mlir::Value resultRealAndImagAreNaN = builder.createLogicalAnd(loc, resultRealIsNaN, resultImagIsNaN); - return builder - .create( - loc, resultRealAndImagAreNaN, - [&](mlir::OpBuilder &, mlir::Location) { - mlir::Value libCallResult = buildComplexBinOpLibCall( - pass, builder, &getComplexMulLibCallName, loc, complexTy, - lhsReal, lhsImag, rhsReal, rhsImag); - builder.createYield(loc, libCallResult); - }, - [&](mlir::OpBuilder &, mlir::Location) { - builder.createYield(loc, algebraicResult); - }) + return cir::TernaryOp::create( + builder, loc, resultRealAndImagAreNaN, + [&](mlir::OpBuilder &, mlir::Location) { + mlir::Value libCallResult = buildComplexBinOpLibCall( + pass, builder, &getComplexMulLibCallName, loc, complexTy, + lhsReal, lhsImag, rhsReal, rhsImag); + builder.createYield(loc, libCallResult); + }, + [&](mlir::OpBuilder &, mlir::Location) { + builder.createYield(loc, algebraicResult); + }) .getResult(); } @@ -920,15 +919,15 @@ static void lowerArrayDtorCtorIntoLoop(cir::CIRBaseBuilderTy &builder, loc, /*condBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { - auto currentElement = b.create(loc, eltTy, tmpAddr); + auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); mlir::Type boolTy = 
cir::BoolType::get(b.getContext()); - auto cmp = builder.create(loc, boolTy, cir::CmpOpKind::ne, - currentElement, stop); + auto cmp = cir::CmpOp::create(builder, loc, boolTy, cir::CmpOpKind::ne, + currentElement, stop); builder.createCondition(cmp); }, /*bodyBuilder=*/ [&](mlir::OpBuilder &b, mlir::Location loc) { - auto currentElement = b.create(loc, eltTy, tmpAddr); + auto currentElement = cir::LoadOp::create(b, loc, eltTy, tmpAddr); cir::CallOp ctorCall; op->walk([&](cir::CallOp c) { ctorCall = c; }); diff --git a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp index 11ce2a81e7f39..5a067f8ec0383 100644 --- a/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp +++ b/clang/lib/CIR/Dialect/Transforms/LoweringPrepareItaniumCXXABI.cpp @@ -77,10 +77,11 @@ buildDynamicCastAfterNullCheck(cir::CIRBaseBuilderTy &builder, if (op.isRefCast()) { // Emit a cir.if that checks the casted value. mlir::Value castedValueIsNull = builder.createPtrIsNull(castedPtr); - builder.create( - loc, castedValueIsNull, false, [&](mlir::OpBuilder &, mlir::Location) { - buildBadCastCall(builder, loc, castInfo.getBadCastFunc()); - }); + cir::IfOp::create(builder, loc, castedValueIsNull, false, + [&](mlir::OpBuilder &, mlir::Location) { + buildBadCastCall(builder, loc, + castInfo.getBadCastFunc()); + }); } // Note that castedPtr is a void*. Cast it to a pointer to the destination @@ -154,19 +155,19 @@ LoweringPrepareItaniumCXXABI::lowerDynamicCast(cir::CIRBaseBuilderTy &builder, return buildDynamicCastAfterNullCheck(builder, op); mlir::Value srcValueIsNotNull = builder.createPtrToBoolCast(srcValue); - return builder - .create( - loc, srcValueIsNotNull, - [&](mlir::OpBuilder &, mlir::Location) { - mlir::Value castedValue = - op.isCastToVoid() - ? 
buildDynamicCastToVoidAfterNullCheck(builder, astCtx, op) - : buildDynamicCastAfterNullCheck(builder, op); - builder.createYield(loc, castedValue); - }, - [&](mlir::OpBuilder &, mlir::Location) { - builder.createYield( - loc, builder.getNullPtr(op.getType(), loc).getResult()); - }) + return cir::TernaryOp::create( + builder, loc, srcValueIsNotNull, + [&](mlir::OpBuilder &, mlir::Location) { + mlir::Value castedValue = + op.isCastToVoid() + ? buildDynamicCastToVoidAfterNullCheck(builder, astCtx, + op) + : buildDynamicCastAfterNullCheck(builder, op); + builder.createYield(loc, castedValue); + }, + [&](mlir::OpBuilder &, mlir::Location) { + builder.createYield( + loc, builder.getNullPtr(op.getType(), loc).getResult()); + }) .getResult(); } diff --git a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp index bb75f2d94066f..5a6193fa8d840 100644 --- a/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp +++ b/clang/lib/CIR/Lowering/DirectToLLVM/LowerToLLVM.cpp @@ -90,12 +90,12 @@ static mlir::Value createIntCast(mlir::OpBuilder &bld, mlir::Value src, mlir::Location loc = src.getLoc(); if (dstWidth > srcWidth && isSigned) - return bld.create(loc, dstTy, src); + return mlir::LLVM::SExtOp::create(bld, loc, dstTy, src); if (dstWidth > srcWidth) - return bld.create(loc, dstTy, src); + return mlir::LLVM::ZExtOp::create(bld, loc, dstTy, src); if (dstWidth < srcWidth) - return bld.create(loc, dstTy, src); - return bld.create(loc, dstTy, src); + return mlir::LLVM::TruncOp::create(bld, loc, dstTy, src); + return mlir::LLVM::BitcastOp::create(bld, loc, dstTy, src); } static mlir::LLVM::Visibility @@ -204,12 +204,12 @@ static mlir::Value getLLVMIntCast(mlir::ConversionPatternRewriter &rewriter, auto loc = llvmSrc.getLoc(); if (cirSrcWidth < cirDstIntWidth) { if (isUnsigned) - return rewriter.create(loc, llvmDstIntTy, llvmSrc); - return rewriter.create(loc, llvmDstIntTy, llvmSrc); + return mlir::LLVM::ZExtOp::create(rewriter, 
loc, llvmDstIntTy, llvmSrc); + return mlir::LLVM::SExtOp::create(rewriter, loc, llvmDstIntTy, llvmSrc); } // Otherwise truncate - return rewriter.create(loc, llvmDstIntTy, llvmSrc); + return mlir::LLVM::TruncOp::create(rewriter, loc, llvmDstIntTy, llvmSrc); } class CIRAttrToValue { @@ -315,15 +315,17 @@ static mlir::LLVM::CallIntrinsicOp replaceOpWithCallLLVMIntrinsicOp( /// IntAttr visitor. mlir::Value CIRAttrToValue::visitCirAttr(cir::IntAttr intAttr) { mlir::Location loc = parentOp->getLoc(); - return rewriter.create( - loc, converter->convertType(intAttr.getType()), intAttr.getValue()); + return mlir::LLVM::ConstantOp::create( + rewriter, loc, converter->convertType(intAttr.getType()), + intAttr.getValue()); } /// FPAttr visitor. mlir::Value CIRAttrToValue::visitCirAttr(cir::FPAttr fltAttr) { mlir::Location loc = parentOp->getLoc(); - return rewriter.create( - loc, converter->convertType(fltAttr.getType()), fltAttr.getValue()); + return mlir::LLVM::ConstantOp::create( + rewriter, loc, converter->convertType(fltAttr.getType()), + fltAttr.getValue()); } /// ConstComplexAttr visitor. 
@@ -350,8 +352,8 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstComplexAttr complexAttr) { } mlir::Location loc = parentOp->getLoc(); - return rewriter.create( - loc, converter->convertType(complexAttr.getType()), + return mlir::LLVM::ConstantOp::create( + rewriter, loc, converter->convertType(complexAttr.getType()), rewriter.getArrayAttr(components)); } @@ -359,15 +361,16 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstComplexAttr complexAttr) { mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstPtrAttr ptrAttr) { mlir::Location loc = parentOp->getLoc(); if (ptrAttr.isNullValue()) { - return rewriter.create( - loc, converter->convertType(ptrAttr.getType())); + return mlir::LLVM::ZeroOp::create( + rewriter, loc, converter->convertType(ptrAttr.getType())); } mlir::DataLayout layout(parentOp->getParentOfType()); - mlir::Value ptrVal = rewriter.create( - loc, rewriter.getIntegerType(layout.getTypeSizeInBits(ptrAttr.getType())), + mlir::Value ptrVal = mlir::LLVM::ConstantOp::create( + rewriter, loc, + rewriter.getIntegerType(layout.getTypeSizeInBits(ptrAttr.getType())), ptrAttr.getValue().getInt()); - return rewriter.create( - loc, converter->convertType(ptrAttr.getType()), ptrVal); + return mlir::LLVM::IntToPtrOp::create( + rewriter, loc, converter->convertType(ptrAttr.getType()), ptrVal); } // ConstArrayAttr visitor @@ -378,10 +381,10 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { if (attr.hasTrailingZeros()) { mlir::Type arrayTy = attr.getType(); - result = rewriter.create( - loc, converter->convertType(arrayTy)); + result = mlir::LLVM::ZeroOp::create(rewriter, loc, + converter->convertType(arrayTy)); } else { - result = rewriter.create(loc, llvmTy); + result = mlir::LLVM::UndefOp::create(rewriter, loc, llvmTy); } // Iteratively lower each constant element of the array. 
@@ -390,7 +393,7 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { mlir::DataLayout dataLayout(parentOp->getParentOfType()); mlir::Value init = visit(elt); result = - rewriter.create(loc, result, init, idx); + mlir::LLVM::InsertValueOp::create(rewriter, loc, result, init, idx); } } else if (auto strAttr = mlir::dyn_cast(attr.getElts())) { // TODO(cir): this diverges from traditional lowering. Normally the string @@ -399,10 +402,10 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { assert(arrayTy && "String attribute must have an array type"); mlir::Type eltTy = arrayTy.getElementType(); for (auto [idx, elt] : llvm::enumerate(strAttr)) { - auto init = rewriter.create( - loc, converter->convertType(eltTy), elt); + auto init = mlir::LLVM::ConstantOp::create( + rewriter, loc, converter->convertType(eltTy), elt); result = - rewriter.create(loc, result, init, idx); + mlir::LLVM::InsertValueOp::create(rewriter, loc, result, init, idx); } } else { llvm_unreachable("unexpected ConstArrayAttr elements"); @@ -415,12 +418,13 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstArrayAttr attr) { mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstRecordAttr constRecord) { const mlir::Type llvmTy = converter->convertType(constRecord.getType()); const mlir::Location loc = parentOp->getLoc(); - mlir::Value result = rewriter.create(loc, llvmTy); + mlir::Value result = mlir::LLVM::UndefOp::create(rewriter, loc, llvmTy); // Iteratively lower each constant element of the record. 
for (auto [idx, elt] : llvm::enumerate(constRecord.getMembers())) { mlir::Value init = visit(elt); - result = rewriter.create(loc, result, init, idx); + result = + mlir::LLVM::InsertValueOp::create(rewriter, loc, result, init, idx); } return result; @@ -447,8 +451,8 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::ConstVectorAttr attr) { mlirValues.push_back(mlirAttr); } - return rewriter.create( - loc, llvmTy, + return mlir::LLVM::ConstantOp::create( + rewriter, loc, llvmTy, mlir::DenseElementsAttr::get(mlir::cast(llvmTy), mlirValues)); } @@ -483,8 +487,9 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::GlobalViewAttr globalAttr) { } mlir::Location loc = parentOp->getLoc(); - mlir::Value addrOp = rewriter.create( - loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()), symName); + mlir::Value addrOp = mlir::LLVM::AddressOfOp::create( + rewriter, loc, mlir::LLVM::LLVMPointerType::get(rewriter.getContext()), + symName); if (globalAttr.getIndices()) { llvm::SmallVector indices; @@ -499,8 +504,9 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::GlobalViewAttr globalAttr) { } mlir::Type resTy = addrOp.getType(); mlir::Type eltTy = converter->convertType(sourceType); - addrOp = rewriter.create( - loc, resTy, eltTy, addrOp, indices, mlir::LLVM::GEPNoWrapFlags::none); + addrOp = + mlir::LLVM::GEPOp::create(rewriter, loc, resTy, eltTy, addrOp, indices, + mlir::LLVM::GEPNoWrapFlags::none); } // The incubator has handling here for the attribute having integer type, but @@ -517,8 +523,8 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::GlobalViewAttr globalAttr) { return addrOp; mlir::Type llvmDstTy = converter->convertType(globalAttr.getType()); - return rewriter.create(parentOp->getLoc(), llvmDstTy, - addrOp); + return mlir::LLVM::BitcastOp::create(rewriter, parentOp->getLoc(), + llvmDstTy, addrOp); } llvm_unreachable("Expecting pointer or integer type for GlobalViewAttr"); @@ -557,8 +563,8 @@ mlir::Value CIRAttrToValue::visitCirAttr(cir::VTableAttr vtableArr) { /// 
ZeroAttr visitor. mlir::Value CIRAttrToValue::visitCirAttr(cir::ZeroAttr attr) { mlir::Location loc = parentOp->getLoc(); - return rewriter.create( - loc, converter->convertType(attr.getType())); + return mlir::LLVM::ZeroOp::create(rewriter, loc, + converter->convertType(attr.getType())); } // This class handles rewriting initializer attributes for types that do not @@ -666,8 +672,8 @@ mlir::LogicalResult CIRToLLVMAssumeAlignedOpLowering::matchAndRewrite( mlir::LogicalResult CIRToLLVMAssumeSepStorageOpLowering::matchAndRewrite( cir::AssumeSepStorageOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { - auto cond = rewriter.create(op.getLoc(), - rewriter.getI1Type(), 1); + auto cond = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + rewriter.getI1Type(), 1); rewriter.replaceOpWithNewOp( op, cond, mlir::LLVM::AssumeSeparateStorageTag{}, adaptor.getPtr1(), adaptor.getPtr2()); @@ -914,28 +920,28 @@ mlir::LogicalResult CIRToLLVMAtomicFetchOpLowering::matchAndRewrite( mlir::LogicalResult CIRToLLVMBitClrsbOpLowering::matchAndRewrite( cir::BitClrsbOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { - auto zero = rewriter.create( - op.getLoc(), adaptor.getInput().getType(), 0); - auto isNeg = rewriter.create( - op.getLoc(), + auto zero = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + adaptor.getInput().getType(), 0); + auto isNeg = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), mlir::LLVM::ICmpPredicateAttr::get(rewriter.getContext(), mlir::LLVM::ICmpPredicate::slt), adaptor.getInput(), zero); - auto negOne = rewriter.create( - op.getLoc(), adaptor.getInput().getType(), -1); - auto flipped = rewriter.create(op.getLoc(), - adaptor.getInput(), negOne); + auto negOne = mlir::LLVM::ConstantOp::create( + rewriter, op.getLoc(), adaptor.getInput().getType(), -1); + auto flipped = mlir::LLVM::XOrOp::create(rewriter, op.getLoc(), + adaptor.getInput(), negOne); - auto select = rewriter.create( - op.getLoc(), 
isNeg, flipped, adaptor.getInput()); + auto select = mlir::LLVM::SelectOp::create(rewriter, op.getLoc(), isNeg, + flipped, adaptor.getInput()); auto resTy = getTypeConverter()->convertType(op.getType()); - auto clz = rewriter.create( - op.getLoc(), resTy, select, /*is_zero_poison=*/false); + auto clz = mlir::LLVM::CountLeadingZerosOp::create( + rewriter, op.getLoc(), resTy, select, /*is_zero_poison=*/false); - auto one = rewriter.create(op.getLoc(), resTy, 1); - auto res = rewriter.create(op.getLoc(), clz, one); + auto one = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), resTy, 1); + auto res = mlir::LLVM::SubOp::create(rewriter, op.getLoc(), clz, one); rewriter.replaceOp(op, res); return mlir::LogicalResult::success(); @@ -945,8 +951,8 @@ mlir::LogicalResult CIRToLLVMBitClzOpLowering::matchAndRewrite( cir::BitClzOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto llvmOp = rewriter.create( - op.getLoc(), resTy, adaptor.getInput(), op.getPoisonZero()); + auto llvmOp = mlir::LLVM::CountLeadingZerosOp::create( + rewriter, op.getLoc(), resTy, adaptor.getInput(), op.getPoisonZero()); rewriter.replaceOp(op, llvmOp); return mlir::LogicalResult::success(); } @@ -955,8 +961,8 @@ mlir::LogicalResult CIRToLLVMBitCtzOpLowering::matchAndRewrite( cir::BitCtzOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto llvmOp = rewriter.create( - op.getLoc(), resTy, adaptor.getInput(), op.getPoisonZero()); + auto llvmOp = mlir::LLVM::CountTrailingZerosOp::create( + rewriter, op.getLoc(), resTy, adaptor.getInput(), op.getPoisonZero()); rewriter.replaceOp(op, llvmOp); return mlir::LogicalResult::success(); } @@ -965,23 +971,24 @@ mlir::LogicalResult CIRToLLVMBitFfsOpLowering::matchAndRewrite( cir::BitFfsOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = 
getTypeConverter()->convertType(op.getType()); - auto ctz = rewriter.create( - op.getLoc(), resTy, adaptor.getInput(), /*is_zero_poison=*/true); + auto ctz = mlir::LLVM::CountTrailingZerosOp::create(rewriter, op.getLoc(), + resTy, adaptor.getInput(), + /*is_zero_poison=*/true); - auto one = rewriter.create(op.getLoc(), resTy, 1); - auto ctzAddOne = rewriter.create(op.getLoc(), ctz, one); + auto one = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), resTy, 1); + auto ctzAddOne = mlir::LLVM::AddOp::create(rewriter, op.getLoc(), ctz, one); - auto zeroInputTy = rewriter.create( - op.getLoc(), adaptor.getInput().getType(), 0); - auto isZero = rewriter.create( - op.getLoc(), + auto zeroInputTy = mlir::LLVM::ConstantOp::create( + rewriter, op.getLoc(), adaptor.getInput().getType(), 0); + auto isZero = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), mlir::LLVM::ICmpPredicateAttr::get(rewriter.getContext(), mlir::LLVM::ICmpPredicate::eq), adaptor.getInput(), zeroInputTy); - auto zero = rewriter.create(op.getLoc(), resTy, 0); - auto res = rewriter.create(op.getLoc(), isZero, zero, - ctzAddOne); + auto zero = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), resTy, 0); + auto res = mlir::LLVM::SelectOp::create(rewriter, op.getLoc(), isZero, zero, + ctzAddOne); rewriter.replaceOp(op, res); return mlir::LogicalResult::success(); @@ -991,12 +998,12 @@ mlir::LogicalResult CIRToLLVMBitParityOpLowering::matchAndRewrite( cir::BitParityOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto popcnt = rewriter.create(op.getLoc(), resTy, - adaptor.getInput()); + auto popcnt = mlir::LLVM::CtPopOp::create(rewriter, op.getLoc(), resTy, + adaptor.getInput()); - auto one = rewriter.create(op.getLoc(), resTy, 1); + auto one = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), resTy, 1); auto popcntMod2 = - rewriter.create(op.getLoc(), popcnt, one); + mlir::LLVM::AndOp::create(rewriter, 
op.getLoc(), popcnt, one); rewriter.replaceOp(op, popcntMod2); return mlir::LogicalResult::success(); @@ -1006,8 +1013,8 @@ mlir::LogicalResult CIRToLLVMBitPopcountOpLowering::matchAndRewrite( cir::BitPopcountOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto resTy = getTypeConverter()->convertType(op.getType()); - auto llvmOp = rewriter.create(op.getLoc(), resTy, - adaptor.getInput()); + auto llvmOp = mlir::LLVM::CtPopOp::create(rewriter, op.getLoc(), resTy, + adaptor.getInput()); rewriter.replaceOp(op, llvmOp); return mlir::LogicalResult::success(); } @@ -1067,8 +1074,8 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite( } case cir::CastKind::int_to_bool: { mlir::Value llvmSrcVal = adaptor.getSrc(); - mlir::Value zeroInt = rewriter.create( - castOp.getLoc(), llvmSrcVal.getType(), 0); + mlir::Value zeroInt = mlir::LLVM::ConstantOp::create( + rewriter, castOp.getLoc(), llvmSrcVal.getType(), 0); rewriter.replaceOpWithNewOp( castOp, mlir::LLVM::ICmpPredicate::ne, llvmSrcVal, zeroInt); break; @@ -1132,8 +1139,8 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite( auto kind = mlir::LLVM::FCmpPredicate::une; // Check if float is not equal to zero. - auto zeroFloat = rewriter.create( - castOp.getLoc(), llvmSrcVal.getType(), + auto zeroFloat = mlir::LLVM::ConstantOp::create( + rewriter, castOp.getLoc(), llvmSrcVal.getType(), mlir::FloatAttr::get(llvmSrcVal.getType(), 0.0)); // Extend comparison result to either bool (C++) or int (C). 
@@ -1204,8 +1211,8 @@ mlir::LogicalResult CIRToLLVMCastOpLowering::matchAndRewrite( } case cir::CastKind::ptr_to_bool: { mlir::Value llvmSrcVal = adaptor.getSrc(); - mlir::Value zeroPtr = rewriter.create( - castOp.getLoc(), llvmSrcVal.getType()); + mlir::Value zeroPtr = mlir::LLVM::ZeroOp::create(rewriter, castOp.getLoc(), + llvmSrcVal.getType()); rewriter.replaceOpWithNewOp( castOp, mlir::LLVM::ICmpPredicate::ne, llvmSrcVal, zeroPtr); break; @@ -1275,10 +1282,10 @@ mlir::LogicalResult CIRToLLVMPtrStrideOpLowering::matchAndRewrite( // Rewrite the sub in front of extensions/trunc if (rewriteSub) { - index = rewriter.create( - index.getLoc(), index.getType(), - rewriter.create(index.getLoc(), - index.getType(), 0), + index = mlir::LLVM::SubOp::create( + rewriter, index.getLoc(), index.getType(), + mlir::LLVM::ConstantOp::create(rewriter, index.getLoc(), + index.getType(), 0), index); rewriter.eraseOp(sub); } @@ -1310,11 +1317,11 @@ mlir::LogicalResult CIRToLLVMBaseClassAddrOpLowering::matchAndRewrite( baseClassOp, resultType, byteType, derivedAddr, offset); } else { auto loc = baseClassOp.getLoc(); - mlir::Value isNull = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::eq, derivedAddr, - rewriter.create(loc, derivedAddr.getType())); - mlir::Value adjusted = rewriter.create( - loc, resultType, byteType, derivedAddr, offset); + mlir::Value isNull = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::eq, derivedAddr, + mlir::LLVM::ZeroOp::create(rewriter, loc, derivedAddr.getType())); + mlir::Value adjusted = mlir::LLVM::GEPOp::create( + rewriter, loc, resultType, byteType, derivedAddr, offset); rewriter.replaceOpWithNewOp(baseClassOp, isNull, derivedAddr, adjusted); } @@ -1335,8 +1342,8 @@ mlir::LogicalResult CIRToLLVMAllocaOpLowering::matchAndRewrite( mlir::Value size = op.isDynamic() ? 
adaptor.getDynAllocSize() - : rewriter.create( - op.getLoc(), + : mlir::LLVM::ConstantOp::create( + rewriter, op.getLoc(), typeConverter->convertType(rewriter.getIndexType()), 1); mlir::Type elementTy = convertTypeForMemory(*getTypeConverter(), dataLayout, op.getAllocaType()); @@ -1688,19 +1695,28 @@ static uint64_t getTypeSize(mlir::Type type, mlir::Operation &op) { return llvm::divideCeil(layout.getTypeSizeInBits(type), 8); } +mlir::LogicalResult CIRToLLVMPrefetchOpLowering::matchAndRewrite( + cir::PrefetchOp op, OpAdaptor adaptor, + mlir::ConversionPatternRewriter &rewriter) const { + rewriter.replaceOpWithNewOp( + op, adaptor.getAddr(), adaptor.getIsWrite(), adaptor.getLocality(), + /*DataCache=*/1); + return mlir::success(); +} + mlir::LogicalResult CIRToLLVMPtrDiffOpLowering::matchAndRewrite( cir::PtrDiffOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { auto dstTy = mlir::cast(op.getType()); mlir::Type llvmDstTy = getTypeConverter()->convertType(dstTy); - auto lhs = rewriter.create(op.getLoc(), llvmDstTy, - adaptor.getLhs()); - auto rhs = rewriter.create(op.getLoc(), llvmDstTy, - adaptor.getRhs()); + auto lhs = mlir::LLVM::PtrToIntOp::create(rewriter, op.getLoc(), llvmDstTy, + adaptor.getLhs()); + auto rhs = mlir::LLVM::PtrToIntOp::create(rewriter, op.getLoc(), llvmDstTy, + adaptor.getRhs()); auto diff = - rewriter.create(op.getLoc(), llvmDstTy, lhs, rhs); + mlir::LLVM::SubOp::create(rewriter, op.getLoc(), llvmDstTy, lhs, rhs); cir::PointerType ptrTy = op.getLhs().getType(); assert(!cir::MissingFeatures::llvmLoweringPtrDiffConsidersPointee()); @@ -1709,17 +1725,17 @@ mlir::LogicalResult CIRToLLVMPtrDiffOpLowering::matchAndRewrite( // Avoid silly division by 1. 
mlir::Value resultVal = diff.getResult(); if (typeSize != 1) { - auto typeSizeVal = rewriter.create( - op.getLoc(), llvmDstTy, typeSize); + auto typeSizeVal = mlir::LLVM::ConstantOp::create(rewriter, op.getLoc(), + llvmDstTy, typeSize); if (dstTy.isUnsigned()) { auto uDiv = - rewriter.create(op.getLoc(), diff, typeSizeVal); + mlir::LLVM::UDivOp::create(rewriter, op.getLoc(), diff, typeSizeVal); uDiv.setIsExact(true); resultVal = uDiv.getResult(); } else { auto sDiv = - rewriter.create(op.getLoc(), diff, typeSizeVal); + mlir::LLVM::SDivOp::create(rewriter, op.getLoc(), diff, typeSizeVal); sDiv.setIsExact(true); resultVal = sDiv.getResult(); } @@ -1847,8 +1863,8 @@ mlir::LogicalResult CIRToLLVMFuncOpLowering::matchAndRewrite( SmallVector attributes; lowerFuncAttributes(op, /*filterArgAndResAttrs=*/false, attributes); - mlir::LLVM::LLVMFuncOp fn = rewriter.create( - loc, op.getName(), llvmFnTy, linkage, isDsoLocal, cconv, + mlir::LLVM::LLVMFuncOp fn = mlir::LLVM::LLVMFuncOp::create( + rewriter, loc, op.getName(), llvmFnTy, linkage, isDsoLocal, cconv, mlir::SymbolRefAttr(), attributes); assert(!cir::MissingFeatures::opFuncMultipleReturnVals()); @@ -1884,8 +1900,8 @@ mlir::LogicalResult CIRToLLVMGetGlobalOpLowering::matchAndRewrite( } mlir::Type type = getTypeConverter()->convertType(op.getType()); - mlir::Operation *newop = - rewriter.create(op.getLoc(), type, op.getName()); + mlir::Operation *newop = mlir::LLVM::AddressOfOp::create( + rewriter, op.getLoc(), type, op.getName()); assert(!cir::MissingFeatures::opGlobalThreadLocal()); @@ -1941,7 +1957,7 @@ CIRToLLVMGlobalOpLowering::matchAndRewriteRegionInitializedGlobal( setupRegionInitializedLLVMGlobalOp(op, rewriter); CIRAttrToValue valueConverter(op, rewriter, typeConverter); mlir::Value value = valueConverter.visit(init); - rewriter.create(loc, value); + mlir::LLVM::ReturnOp::create(rewriter, loc, value); return mlir::success(); } @@ -2094,14 +2110,14 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( 
switch (op.getKind()) { case cir::UnaryOpKind::Inc: { assert(!isVector && "++ not allowed on vector types"); - auto one = rewriter.create(loc, llvmType, 1); + auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1); rewriter.replaceOpWithNewOp( op, llvmType, adaptor.getInput(), one, maybeNSW); return mlir::success(); } case cir::UnaryOpKind::Dec: { assert(!isVector && "-- not allowed on vector types"); - auto one = rewriter.create(loc, llvmType, 1); + auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1); rewriter.replaceOpWithNewOp(op, adaptor.getInput(), one, maybeNSW); return mlir::success(); @@ -2112,9 +2128,9 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( case cir::UnaryOpKind::Minus: { mlir::Value zero; if (isVector) - zero = rewriter.create(loc, llvmType); + zero = mlir::LLVM::ZeroOp::create(rewriter, loc, llvmType); else - zero = rewriter.create(loc, llvmType, 0); + zero = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 0); rewriter.replaceOpWithNewOp( op, zero, adaptor.getInput(), maybeNSW); return mlir::success(); @@ -2128,9 +2144,9 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( std::vector values(numElements, -1); mlir::DenseIntElementsAttr denseVec = rewriter.getI32VectorAttr(values); minusOne = - rewriter.create(loc, llvmType, denseVec); + mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, denseVec); } else { - minusOne = rewriter.create(loc, llvmType, -1); + minusOne = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, -1); } rewriter.replaceOpWithNewOp(op, adaptor.getInput(), minusOne); @@ -2145,16 +2161,16 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( switch (op.getKind()) { case cir::UnaryOpKind::Inc: { assert(!isVector && "++ not allowed on vector types"); - mlir::LLVM::ConstantOp one = rewriter.create( - loc, llvmType, rewriter.getFloatAttr(llvmType, 1.0)); + mlir::LLVM::ConstantOp one = mlir::LLVM::ConstantOp::create( + rewriter, loc, 
llvmType, rewriter.getFloatAttr(llvmType, 1.0)); rewriter.replaceOpWithNewOp(op, llvmType, one, adaptor.getInput()); return mlir::success(); } case cir::UnaryOpKind::Dec: { assert(!isVector && "-- not allowed on vector types"); - mlir::LLVM::ConstantOp minusOne = rewriter.create( - loc, llvmType, rewriter.getFloatAttr(llvmType, -1.0)); + mlir::LLVM::ConstantOp minusOne = mlir::LLVM::ConstantOp::create( + rewriter, loc, llvmType, rewriter.getFloatAttr(llvmType, -1.0)); rewriter.replaceOpWithNewOp(op, llvmType, minusOne, adaptor.getInput()); return mlir::success(); @@ -2185,7 +2201,7 @@ mlir::LogicalResult CIRToLLVMUnaryOpLowering::matchAndRewrite( return op.emitError() << "Unsupported unary operation on boolean type"; case cir::UnaryOpKind::Not: { assert(!isVector && "NYI: op! on vector mask"); - auto one = rewriter.create(loc, llvmType, 1); + auto one = mlir::LLVM::ConstantOp::create(rewriter, loc, llvmType, 1); rewriter.replaceOpWithNewOp(op, adaptor.getInput(), one); return mlir::success(); @@ -2404,6 +2420,15 @@ mlir::LogicalResult CIRToLLVMCmpOpLowering::matchAndRewrite( return mlir::success(); } + if (auto vptrTy = mlir::dyn_cast(type)) { + // !cir.vptr is a special case, but it's just a pointer to LLVM. 
+ auto kind = convertCmpKindToICmpPredicate(cmpOp.getKind(), + /* isSigned=*/false); + rewriter.replaceOpWithNewOp( + cmpOp, kind, adaptor.getLhs(), adaptor.getRhs()); + return mlir::success(); + } + if (mlir::isa(type)) { mlir::LLVM::FCmpPredicate kind = convertCmpKindToFCmpPredicate(cmpOp.getKind()); @@ -2421,47 +2446,47 @@ mlir::LogicalResult CIRToLLVMCmpOpLowering::matchAndRewrite( mlir::Type complexElemTy = getTypeConverter()->convertType(complexType.getElementType()); - auto lhsReal = - rewriter.create(loc, complexElemTy, lhs, 0); - auto lhsImag = - rewriter.create(loc, complexElemTy, lhs, 1); - auto rhsReal = - rewriter.create(loc, complexElemTy, rhs, 0); - auto rhsImag = - rewriter.create(loc, complexElemTy, rhs, 1); + auto lhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{0})); + auto lhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{1})); + auto rhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{0})); + auto rhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{1})); if (cmpOp.getKind() == cir::CmpOpKind::eq) { if (complexElemTy.isInteger()) { - auto realCmp = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::eq, lhsReal, rhsReal); - auto imagCmp = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::eq, lhsImag, rhsImag); + auto realCmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::eq, lhsReal, rhsReal); + auto imagCmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::eq, lhsImag, rhsImag); rewriter.replaceOpWithNewOp(cmpOp, realCmp, imagCmp); return mlir::success(); } - auto realCmp = rewriter.create( - loc, mlir::LLVM::FCmpPredicate::oeq, lhsReal, rhsReal); - auto imagCmp = rewriter.create( - loc, mlir::LLVM::FCmpPredicate::oeq, lhsImag, rhsImag); + auto realCmp = mlir::LLVM::FCmpOp::create( + rewriter, loc, 
mlir::LLVM::FCmpPredicate::oeq, lhsReal, rhsReal); + auto imagCmp = mlir::LLVM::FCmpOp::create( + rewriter, loc, mlir::LLVM::FCmpPredicate::oeq, lhsImag, rhsImag); rewriter.replaceOpWithNewOp(cmpOp, realCmp, imagCmp); return mlir::success(); } if (cmpOp.getKind() == cir::CmpOpKind::ne) { if (complexElemTy.isInteger()) { - auto realCmp = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::ne, lhsReal, rhsReal); - auto imagCmp = rewriter.create( - loc, mlir::LLVM::ICmpPredicate::ne, lhsImag, rhsImag); + auto realCmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::ne, lhsReal, rhsReal); + auto imagCmp = mlir::LLVM::ICmpOp::create( + rewriter, loc, mlir::LLVM::ICmpPredicate::ne, lhsImag, rhsImag); rewriter.replaceOpWithNewOp(cmpOp, realCmp, imagCmp); return mlir::success(); } - auto realCmp = rewriter.create( - loc, mlir::LLVM::FCmpPredicate::une, lhsReal, rhsReal); - auto imagCmp = rewriter.create( - loc, mlir::LLVM::FCmpPredicate::une, lhsImag, rhsImag); + auto realCmp = mlir::LLVM::FCmpOp::create( + rewriter, loc, mlir::LLVM::FCmpPredicate::une, lhsReal, rhsReal); + auto imagCmp = mlir::LLVM::FCmpOp::create( + rewriter, loc, mlir::LLVM::FCmpPredicate::une, lhsImag, rhsImag); rewriter.replaceOpWithNewOp(cmpOp, realCmp, imagCmp); return mlir::success(); } @@ -2725,7 +2750,7 @@ static void buildCtorDtorList( index); } - builder.create(loc, result); + mlir::LLVM::ReturnOp::create(builder, loc, result); } // The applyPartialConversion function traverses blocks in the dominance order, @@ -2904,7 +2929,7 @@ void createLLVMFuncOpIfNotExist(mlir::ConversionPatternRewriter &rewriter, if (!sourceSymbol) { mlir::OpBuilder::InsertionGuard guard(rewriter); rewriter.setInsertionPoint(enclosingFnOp); - rewriter.create(srcOp->getLoc(), fnName, fnTy); + mlir::LLVM::LLVMFuncOp::create(rewriter, srcOp->getLoc(), fnName, fnTy); } } @@ -2983,12 +3008,12 @@ mlir::LogicalResult CIRToLLVMTrapOpLowering::matchAndRewrite( mlir::Location loc = op->getLoc(); 
rewriter.eraseOp(op); - rewriter.create(loc); + mlir::LLVM::Trap::create(rewriter, loc); // Note that the call to llvm.trap is not a terminator in LLVM dialect. // So we must emit an additional llvm.unreachable to terminate the current // block. - rewriter.create(loc); + mlir::LLVM::UnreachableOp::create(rewriter, loc); return mlir::success(); } @@ -3114,15 +3139,15 @@ mlir::LogicalResult CIRToLLVMVecCreateOpLowering::matchAndRewrite( const auto vecTy = mlir::cast(op.getType()); const mlir::Type llvmTy = typeConverter->convertType(vecTy); const mlir::Location loc = op.getLoc(); - mlir::Value result = rewriter.create(loc, llvmTy); + mlir::Value result = mlir::LLVM::PoisonOp::create(rewriter, loc, llvmTy); assert(vecTy.getSize() == op.getElements().size() && "cir.vec.create op count doesn't match vector type elements count"); for (uint64_t i = 0; i < vecTy.getSize(); ++i) { const mlir::Value indexValue = - rewriter.create(loc, rewriter.getI64Type(), i); - result = rewriter.create( - loc, result, adaptor.getElements()[i], indexValue); + mlir::LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), i); + result = mlir::LLVM::InsertElementOp::create( + rewriter, loc, result, adaptor.getElements()[i], indexValue); } rewriter.replaceOp(op, result); @@ -3151,13 +3176,13 @@ mlir::LogicalResult CIRToLLVMVecCmpOpLowering::matchAndRewrite( mlir::Type elementType = elementTypeIfVector(op.getLhs().getType()); mlir::Value bitResult; if (auto intType = mlir::dyn_cast(elementType)) { - bitResult = rewriter.create( - op.getLoc(), + bitResult = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), convertCmpKindToICmpPredicate(op.getKind(), intType.isSigned()), adaptor.getLhs(), adaptor.getRhs()); } else if (mlir::isa(elementType)) { - bitResult = rewriter.create( - op.getLoc(), convertCmpKindToFCmpPredicate(op.getKind()), + bitResult = mlir::LLVM::FCmpOp::create( + rewriter, op.getLoc(), convertCmpKindToFCmpPredicate(op.getKind()), adaptor.getLhs(), adaptor.getRhs()); } else { 
return op.emitError() << "unsupported type for VecCmpOp: " << elementType; @@ -3181,7 +3206,7 @@ mlir::LogicalResult CIRToLLVMVecSplatOpLowering::matchAndRewrite( cir::VectorType vecTy = op.getType(); mlir::Type llvmTy = typeConverter->convertType(vecTy); mlir::Location loc = op.getLoc(); - mlir::Value poison = rewriter.create(loc, llvmTy); + mlir::Value poison = mlir::LLVM::PoisonOp::create(rewriter, loc, llvmTy); mlir::Value elementValue = adaptor.getValue(); if (elementValue.getDefiningOp()) { @@ -3210,9 +3235,9 @@ mlir::LogicalResult CIRToLLVMVecSplatOpLowering::matchAndRewrite( } mlir::Value indexValue = - rewriter.create(loc, rewriter.getI64Type(), 0); - mlir::Value oneElement = rewriter.create( - loc, poison, elementValue, indexValue); + mlir::LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), 0); + mlir::Value oneElement = mlir::LLVM::InsertElementOp::create( + rewriter, loc, poison, elementValue, indexValue); SmallVector zeroValues(vecTy.getSize(), 0); rewriter.replaceOpWithNewOp(op, oneElement, poison, zeroValues); @@ -3260,31 +3285,32 @@ mlir::LogicalResult CIRToLLVMVecShuffleDynamicOpLowering::matchAndRewrite( mlir::cast(op.getVec().getType()).getSize(); uint64_t maskBits = llvm::NextPowerOf2(numElements - 1) - 1; - mlir::Value maskValue = rewriter.create( - loc, llvmIndexType, rewriter.getIntegerAttr(llvmIndexType, maskBits)); + mlir::Value maskValue = mlir::LLVM::ConstantOp::create( + rewriter, loc, llvmIndexType, + rewriter.getIntegerAttr(llvmIndexType, maskBits)); mlir::Value maskVector = - rewriter.create(loc, llvmIndexVecType); + mlir::LLVM::UndefOp::create(rewriter, loc, llvmIndexVecType); for (uint64_t i = 0; i < numElements; ++i) { mlir::Value idxValue = - rewriter.create(loc, rewriter.getI64Type(), i); - maskVector = rewriter.create( - loc, maskVector, maskValue, idxValue); + mlir::LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), i); + maskVector = mlir::LLVM::InsertElementOp::create(rewriter, loc, maskVector, + 
maskValue, idxValue); } - mlir::Value maskedIndices = rewriter.create( - loc, llvmIndexVecType, adaptor.getIndices(), maskVector); - mlir::Value result = rewriter.create( - loc, getTypeConverter()->convertType(op.getVec().getType())); + mlir::Value maskedIndices = mlir::LLVM::AndOp::create( + rewriter, loc, llvmIndexVecType, adaptor.getIndices(), maskVector); + mlir::Value result = mlir::LLVM::UndefOp::create( + rewriter, loc, getTypeConverter()->convertType(op.getVec().getType())); for (uint64_t i = 0; i < numElements; ++i) { mlir::Value iValue = - rewriter.create(loc, rewriter.getI64Type(), i); - mlir::Value indexValue = rewriter.create( - loc, maskedIndices, iValue); + mlir::LLVM::ConstantOp::create(rewriter, loc, rewriter.getI64Type(), i); + mlir::Value indexValue = mlir::LLVM::ExtractElementOp::create( + rewriter, loc, maskedIndices, iValue); mlir::Value valueAtIndex = - rewriter.create(loc, input, indexValue); - result = rewriter.create(loc, result, - valueAtIndex, iValue); + mlir::LLVM::ExtractElementOp::create(rewriter, loc, input, indexValue); + result = mlir::LLVM::InsertElementOp::create(rewriter, loc, result, + valueAtIndex, iValue); } rewriter.replaceOp(op, result); return mlir::success(); @@ -3294,10 +3320,10 @@ mlir::LogicalResult CIRToLLVMVecTernaryOpLowering::matchAndRewrite( cir::VecTernaryOp op, OpAdaptor adaptor, mlir::ConversionPatternRewriter &rewriter) const { // Convert `cond` into a vector of i1, then use that in a `select` op. 
- mlir::Value bitVec = rewriter.create( - op.getLoc(), mlir::LLVM::ICmpPredicate::ne, adaptor.getCond(), - rewriter.create( - op.getCond().getLoc(), + mlir::Value bitVec = mlir::LLVM::ICmpOp::create( + rewriter, op.getLoc(), mlir::LLVM::ICmpPredicate::ne, adaptor.getCond(), + mlir::LLVM::ZeroOp::create( + rewriter, op.getCond().getLoc(), typeConverter->convertType(op.getCond().getType()))); rewriter.replaceOpWithNewOp( op, bitVec, adaptor.getLhs(), adaptor.getRhs()); @@ -3314,41 +3340,41 @@ mlir::LogicalResult CIRToLLVMComplexAddOpLowering::matchAndRewrite( auto complexType = mlir::cast(op.getLhs().getType()); mlir::Type complexElemTy = getTypeConverter()->convertType(complexType.getElementType()); - auto lhsReal = - rewriter.create(loc, complexElemTy, lhs, 0); - auto lhsImag = - rewriter.create(loc, complexElemTy, lhs, 1); - auto rhsReal = - rewriter.create(loc, complexElemTy, rhs, 0); - auto rhsImag = - rewriter.create(loc, complexElemTy, rhs, 1); + auto lhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{0})); + auto lhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{1})); + auto rhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{0})); + auto rhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{1})); mlir::Value newReal; mlir::Value newImag; if (complexElemTy.isInteger()) { - newReal = rewriter.create(loc, complexElemTy, lhsReal, - rhsReal); - newImag = rewriter.create(loc, complexElemTy, lhsImag, - rhsImag); + newReal = mlir::LLVM::AddOp::create(rewriter, loc, complexElemTy, lhsReal, + rhsReal); + newImag = mlir::LLVM::AddOp::create(rewriter, loc, complexElemTy, lhsImag, + rhsImag); } else { assert(!cir::MissingFeatures::fastMathFlags()); assert(!cir::MissingFeatures::fpConstraints()); - newReal = rewriter.create(loc, complexElemTy, lhsReal, - rhsReal); - newImag = 
rewriter.create(loc, complexElemTy, lhsImag, - rhsImag); + newReal = mlir::LLVM::FAddOp::create(rewriter, loc, complexElemTy, lhsReal, + rhsReal); + newImag = mlir::LLVM::FAddOp::create(rewriter, loc, complexElemTy, lhsImag, + rhsImag); } mlir::Type complexLLVMTy = getTypeConverter()->convertType(op.getResult().getType()); auto initialComplex = - rewriter.create(op->getLoc(), complexLLVMTy); + mlir::LLVM::PoisonOp::create(rewriter, op->getLoc(), complexLLVMTy); - auto realComplex = rewriter.create( - op->getLoc(), initialComplex, newReal, 0); + auto realComplex = mlir::LLVM::InsertValueOp::create( + rewriter, op->getLoc(), initialComplex, newReal, ArrayRef(int64_t{0})); - rewriter.replaceOpWithNewOp(op, realComplex, - newImag, 1); + rewriter.replaceOpWithNewOp( + op, realComplex, newImag, ArrayRef(int64_t{1})); return mlir::success(); } @@ -3359,13 +3385,15 @@ mlir::LogicalResult CIRToLLVMComplexCreateOpLowering::matchAndRewrite( mlir::Type complexLLVMTy = getTypeConverter()->convertType(op.getResult().getType()); auto initialComplex = - rewriter.create(op->getLoc(), complexLLVMTy); + mlir::LLVM::UndefOp::create(rewriter, op->getLoc(), complexLLVMTy); - auto realComplex = rewriter.create( - op->getLoc(), initialComplex, adaptor.getReal(), 0); + auto realComplex = mlir::LLVM::InsertValueOp::create( + rewriter, op->getLoc(), initialComplex, adaptor.getReal(), + ArrayRef(int64_t{0})); - auto complex = rewriter.create( - op->getLoc(), realComplex, adaptor.getImag(), 1); + auto complex = mlir::LLVM::InsertValueOp::create( + rewriter, op->getLoc(), realComplex, adaptor.getImag(), + ArrayRef(int64_t{1})); rewriter.replaceOp(op, complex); return mlir::success(); @@ -3395,41 +3423,41 @@ mlir::LogicalResult CIRToLLVMComplexSubOpLowering::matchAndRewrite( auto complexType = mlir::cast(op.getLhs().getType()); mlir::Type complexElemTy = getTypeConverter()->convertType(complexType.getElementType()); - auto lhsReal = - rewriter.create(loc, complexElemTy, lhs, 0); - auto lhsImag = 
- rewriter.create(loc, complexElemTy, lhs, 1); - auto rhsReal = - rewriter.create(loc, complexElemTy, rhs, 0); - auto rhsImag = - rewriter.create(loc, complexElemTy, rhs, 1); + auto lhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{0})); + auto lhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, lhs, ArrayRef(int64_t{1})); + auto rhsReal = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{0})); + auto rhsImag = mlir::LLVM::ExtractValueOp::create( + rewriter, loc, complexElemTy, rhs, ArrayRef(int64_t{1})); mlir::Value newReal; mlir::Value newImag; if (complexElemTy.isInteger()) { - newReal = rewriter.create(loc, complexElemTy, lhsReal, - rhsReal); - newImag = rewriter.create(loc, complexElemTy, lhsImag, - rhsImag); + newReal = mlir::LLVM::SubOp::create(rewriter, loc, complexElemTy, lhsReal, + rhsReal); + newImag = mlir::LLVM::SubOp::create(rewriter, loc, complexElemTy, lhsImag, + rhsImag); } else { assert(!cir::MissingFeatures::fastMathFlags()); assert(!cir::MissingFeatures::fpConstraints()); - newReal = rewriter.create(loc, complexElemTy, lhsReal, - rhsReal); - newImag = rewriter.create(loc, complexElemTy, lhsImag, - rhsImag); + newReal = mlir::LLVM::FSubOp::create(rewriter, loc, complexElemTy, lhsReal, + rhsReal); + newImag = mlir::LLVM::FSubOp::create(rewriter, loc, complexElemTy, lhsImag, + rhsImag); } mlir::Type complexLLVMTy = getTypeConverter()->convertType(op.getResult().getType()); auto initialComplex = - rewriter.create(op->getLoc(), complexLLVMTy); + mlir::LLVM::PoisonOp::create(rewriter, op->getLoc(), complexLLVMTy); - auto realComplex = rewriter.create( - op->getLoc(), initialComplex, newReal, 0); + auto realComplex = mlir::LLVM::InsertValueOp::create( + rewriter, op->getLoc(), initialComplex, newReal, ArrayRef(int64_t{0})); - rewriter.replaceOpWithNewOp(op, realComplex, - newImag, 1); + rewriter.replaceOpWithNewOp( + op, realComplex, 
newImag, ArrayRef(int64_t{1})); return mlir::success(); } @@ -3496,8 +3524,8 @@ mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( if (storageSize != size) { assert(storageSize > size && "Invalid bitfield size."); - mlir::Value val = rewriter.create( - op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), + mlir::Value val = mlir::LLVM::LoadOp::create( + rewriter, op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), op.getIsVolatile()); srcVal = @@ -3510,11 +3538,11 @@ mlir::LogicalResult CIRToLLVMSetBitfieldOpLowering::matchAndRewrite( ~llvm::APInt::getBitsSet(srcWidth, offset, offset + size)); // Or together the unchanged values and the source value. - srcVal = rewriter.create(op.getLoc(), val, srcVal); + srcVal = mlir::LLVM::OrOp::create(rewriter, op.getLoc(), val, srcVal); } - rewriter.create(op.getLoc(), srcVal, adaptor.getAddr(), - op.getAlignment(), op.getIsVolatile()); + mlir::LLVM::StoreOp::create(rewriter, op.getLoc(), srcVal, adaptor.getAddr(), + op.getAlignment(), op.getIsVolatile()); mlir::Type resultTy = getTypeConverter()->convertType(op.getType()); @@ -3587,10 +3615,10 @@ mlir::LogicalResult CIRToLLVMGetBitfieldOpLowering::matchAndRewrite( mlir::IntegerType intType = computeBitfieldIntType(storageType, context, storageSize); - mlir::Value val = rewriter.create( - op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), + mlir::Value val = mlir::LLVM::LoadOp::create( + rewriter, op.getLoc(), intType, adaptor.getAddr(), op.getAlignment(), op.getIsVolatile()); - val = rewriter.create(op.getLoc(), intType, val); + val = mlir::LLVM::BitcastOp::create(rewriter, op.getLoc(), intType, val); if (info.getIsSigned()) { assert(static_cast(offset + size) <= storageSize); diff --git a/clang/lib/CIR/Lowering/LoweringHelpers.cpp b/clang/lib/CIR/Lowering/LoweringHelpers.cpp index d5f132431aba2..0786579a601b1 100644 --- a/clang/lib/CIR/Lowering/LoweringHelpers.cpp +++ b/clang/lib/CIR/Lowering/LoweringHelpers.cpp @@ -148,37 +148,37 @@ 
lowerConstArrayAttr(cir::ConstArrayAttr constArr, mlir::Value getConstAPInt(mlir::OpBuilder &bld, mlir::Location loc, mlir::Type typ, const llvm::APInt &val) { - return bld.create(loc, typ, val); + return mlir::LLVM::ConstantOp::create(bld, loc, typ, val); } mlir::Value getConst(mlir::OpBuilder &bld, mlir::Location loc, mlir::Type typ, unsigned val) { - return bld.create(loc, typ, val); + return mlir::LLVM::ConstantOp::create(bld, loc, typ, val); } mlir::Value createShL(mlir::OpBuilder &bld, mlir::Value lhs, unsigned rhs) { if (!rhs) return lhs; mlir::Value rhsVal = getConst(bld, lhs.getLoc(), lhs.getType(), rhs); - return bld.create(lhs.getLoc(), lhs, rhsVal); + return mlir::LLVM::ShlOp::create(bld, lhs.getLoc(), lhs, rhsVal); } mlir::Value createAShR(mlir::OpBuilder &bld, mlir::Value lhs, unsigned rhs) { if (!rhs) return lhs; mlir::Value rhsVal = getConst(bld, lhs.getLoc(), lhs.getType(), rhs); - return bld.create(lhs.getLoc(), lhs, rhsVal); + return mlir::LLVM::AShrOp::create(bld, lhs.getLoc(), lhs, rhsVal); } mlir::Value createAnd(mlir::OpBuilder &bld, mlir::Value lhs, const llvm::APInt &rhs) { mlir::Value rhsVal = getConstAPInt(bld, lhs.getLoc(), lhs.getType(), rhs); - return bld.create(lhs.getLoc(), lhs, rhsVal); + return mlir::LLVM::AndOp::create(bld, lhs.getLoc(), lhs, rhsVal); } mlir::Value createLShR(mlir::OpBuilder &bld, mlir::Value lhs, unsigned rhs) { if (!rhs) return lhs; mlir::Value rhsVal = getConst(bld, lhs.getLoc(), lhs.getType(), rhs); - return bld.create(lhs.getLoc(), lhs, rhsVal); + return mlir::LLVM::LShrOp::create(bld, lhs.getLoc(), lhs, rhsVal); } diff --git a/clang/lib/CodeGen/BackendUtil.cpp b/clang/lib/CodeGen/BackendUtil.cpp index 602068436101b..468c930acacbd 100644 --- a/clang/lib/CodeGen/BackendUtil.cpp +++ b/clang/lib/CodeGen/BackendUtil.cpp @@ -234,9 +234,12 @@ class EmitAssemblyHelper { }; } // namespace -static AllocTokenOptions getAllocTokenOptions(const CodeGenOptions &CGOpts) { +static AllocTokenOptions 
getAllocTokenOptions(const LangOptions &LangOpts, + const CodeGenOptions &CGOpts) { AllocTokenOptions Opts; - Opts.MaxTokens = CGOpts.AllocTokenMax; + if (LangOpts.AllocTokenMode) + Opts.Mode = *LangOpts.AllocTokenMode; + Opts.MaxTokens = LangOpts.AllocTokenMax; Opts.Extended = CGOpts.SanitizeAllocTokenExtended; Opts.FastABI = CGOpts.SanitizeAllocTokenFastABI; return Opts; @@ -430,12 +433,6 @@ static bool initTargetOptions(const CompilerInstance &CI, Options.NoInfsFPMath = LangOpts.NoHonorInfs; Options.NoNaNsFPMath = LangOpts.NoHonorNaNs; Options.NoZerosInBSS = CodeGenOpts.NoZeroInitializedInBSS; - Options.UnsafeFPMath = LangOpts.AllowFPReassoc && LangOpts.AllowRecip && - LangOpts.NoSignedZero && LangOpts.ApproxFunc && - (LangOpts.getDefaultFPContractMode() == - LangOptions::FPModeKind::FPM_Fast || - LangOpts.getDefaultFPContractMode() == - LangOptions::FPModeKind::FPM_FastHonorPragmas); Options.BBAddrMap = CodeGenOpts.BBAddrMap; Options.BBSections = @@ -687,7 +684,8 @@ static void addKCFIPass(const Triple &TargetTriple, const LangOptions &LangOpts, PassBuilder &PB) { // If the back-end supports KCFI operand bundle lowering, skip KCFIPass. if (TargetTriple.getArch() == llvm::Triple::x86_64 || - TargetTriple.isAArch64(64) || TargetTriple.isRISCV()) + TargetTriple.isAArch64(64) || TargetTriple.isRISCV() || + TargetTriple.isARM() || TargetTriple.isThumb()) return; // Ensure we lower KCFI operand bundles with -O0. @@ -808,7 +806,7 @@ static void addSanitizers(const Triple &TargetTriple, // memory allocation function detection. 
MPM.addPass(InferFunctionAttrsPass()); } - MPM.addPass(AllocTokenPass(getAllocTokenOptions(CodeGenOpts))); + MPM.addPass(AllocTokenPass(getAllocTokenOptions(LangOpts, CodeGenOpts))); } }; if (ClSanitizeOnOptimizerEarlyEP) { diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp index 657bae4f97945..866a0e6cad209 100644 --- a/clang/lib/CodeGen/CGExpr.cpp +++ b/clang/lib/CodeGen/CGExpr.cpp @@ -29,6 +29,7 @@ #include "clang/AST/ASTLambda.h" #include "clang/AST/Attr.h" #include "clang/AST/DeclObjC.h" +#include "clang/AST/InferAlloc.h" #include "clang/AST/NSAPI.h" #include "clang/AST/ParentMapContext.h" #include "clang/AST/StmtVisitor.h" @@ -1273,194 +1274,39 @@ void CodeGenFunction::EmitBoundsCheckImpl(const Expr *E, llvm::Value *Bound, EmitCheck(std::make_pair(Check, CheckKind), CheckHandler, StaticData, Index); } -static bool -typeContainsPointer(QualType T, - llvm::SmallPtrSet &VisitedRD, - bool &IncompleteType) { - QualType CanonicalType = T.getCanonicalType(); - if (CanonicalType->isPointerType()) - return true; // base case - - // Look through typedef chain to check for special types. - for (QualType CurrentT = T; const auto *TT = CurrentT->getAs(); - CurrentT = TT->getDecl()->getUnderlyingType()) { - const IdentifierInfo *II = TT->getDecl()->getIdentifier(); - // Special Case: Syntactically uintptr_t is not a pointer; semantically, - // however, very likely used as such. Therefore, classify uintptr_t as a - // pointer, too. - if (II && II->isStr("uintptr_t")) - return true; - } - - // The type is an array; check the element type. - if (const ArrayType *AT = dyn_cast(CanonicalType)) - return typeContainsPointer(AT->getElementType(), VisitedRD, IncompleteType); - // The type is a struct, class, or union. - if (const RecordDecl *RD = CanonicalType->getAsRecordDecl()) { - if (!RD->isCompleteDefinition()) { - IncompleteType = true; - return false; - } - if (!VisitedRD.insert(RD).second) - return false; // already visited - // Check all fields. 
- for (const FieldDecl *Field : RD->fields()) { - if (typeContainsPointer(Field->getType(), VisitedRD, IncompleteType)) - return true; - } - // For C++ classes, also check base classes. - if (const CXXRecordDecl *CXXRD = dyn_cast(RD)) { - // Polymorphic types require a vptr. - if (CXXRD->isDynamicClass()) - return true; - for (const CXXBaseSpecifier &Base : CXXRD->bases()) { - if (typeContainsPointer(Base.getType(), VisitedRD, IncompleteType)) - return true; - } - } - } - return false; -} - -void CodeGenFunction::EmitAllocToken(llvm::CallBase *CB, QualType AllocType) { - assert(SanOpts.has(SanitizerKind::AllocToken) && - "Only needed with -fsanitize=alloc-token"); +llvm::MDNode *CodeGenFunction::buildAllocToken(QualType AllocType) { + auto ATMD = infer_alloc::getAllocTokenMetadata(AllocType, getContext()); + if (!ATMD) + return nullptr; llvm::MDBuilder MDB(getLLVMContext()); - - // Get unique type name. - PrintingPolicy Policy(CGM.getContext().getLangOpts()); - Policy.SuppressTagKeyword = true; - Policy.FullyQualifiedName = true; - SmallString<64> TypeName; - llvm::raw_svector_ostream TypeNameOS(TypeName); - AllocType.getCanonicalType().print(TypeNameOS, Policy); - auto *TypeNameMD = MDB.createString(TypeNameOS.str()); - - // Check if QualType contains a pointer. Implements a simple DFS to - // recursively check if a type contains a pointer type. 
- llvm::SmallPtrSet VisitedRD; - bool IncompleteType = false; - const bool ContainsPtr = - typeContainsPointer(AllocType, VisitedRD, IncompleteType); - if (!ContainsPtr && IncompleteType) - return; - auto *ContainsPtrC = Builder.getInt1(ContainsPtr); + auto *TypeNameMD = MDB.createString(ATMD->TypeName); + auto *ContainsPtrC = Builder.getInt1(ATMD->ContainsPointer); auto *ContainsPtrMD = MDB.createConstant(ContainsPtrC); // Format: !{, } - auto *MDN = - llvm::MDNode::get(CGM.getLLVMContext(), {TypeNameMD, ContainsPtrMD}); - CB->setMetadata(llvm::LLVMContext::MD_alloc_token, MDN); -} - -namespace { -/// Infer type from a simple sizeof expression. -QualType inferTypeFromSizeofExpr(const Expr *E) { - const Expr *Arg = E->IgnoreParenImpCasts(); - if (const auto *UET = dyn_cast(Arg)) { - if (UET->getKind() == UETT_SizeOf) { - if (UET->isArgumentType()) - return UET->getArgumentTypeInfo()->getType(); - else - return UET->getArgumentExpr()->getType(); - } - } - return QualType(); -} - -/// Infer type from an arithmetic expression involving a sizeof. For example: -/// -/// malloc(sizeof(MyType) + padding); // infers 'MyType' -/// malloc(sizeof(MyType) * 32); // infers 'MyType' -/// malloc(32 * sizeof(MyType)); // infers 'MyType' -/// malloc(sizeof(MyType) << 1); // infers 'MyType' -/// ... -/// -/// More complex arithmetic expressions are supported, but are a heuristic, e.g. -/// when considering allocations for structs with flexible array members: -/// -/// malloc(sizeof(HasFlexArray) + sizeof(int) * 32); // infers 'HasFlexArray' -/// -QualType inferPossibleTypeFromArithSizeofExpr(const Expr *E) { - const Expr *Arg = E->IgnoreParenImpCasts(); - // The argument is a lone sizeof expression. - if (QualType T = inferTypeFromSizeofExpr(Arg); !T.isNull()) - return T; - if (const auto *BO = dyn_cast(Arg)) { - // Argument is an arithmetic expression. Cover common arithmetic patterns - // involving sizeof. 
- switch (BO->getOpcode()) { - case BO_Add: - case BO_Div: - case BO_Mul: - case BO_Shl: - case BO_Shr: - case BO_Sub: - if (QualType T = inferPossibleTypeFromArithSizeofExpr(BO->getLHS()); - !T.isNull()) - return T; - if (QualType T = inferPossibleTypeFromArithSizeofExpr(BO->getRHS()); - !T.isNull()) - return T; - break; - default: - break; - } - } - return QualType(); + return llvm::MDNode::get(CGM.getLLVMContext(), {TypeNameMD, ContainsPtrMD}); } -/// If the expression E is a reference to a variable, infer the type from a -/// variable's initializer if it contains a sizeof. Beware, this is a heuristic -/// and ignores if a variable is later reassigned. For example: -/// -/// size_t my_size = sizeof(MyType); -/// void *x = malloc(my_size); // infers 'MyType' -/// -QualType inferPossibleTypeFromVarInitSizeofExpr(const Expr *E) { - const Expr *Arg = E->IgnoreParenImpCasts(); - if (const auto *DRE = dyn_cast(Arg)) { - if (const auto *VD = dyn_cast(DRE->getDecl())) { - if (const Expr *Init = VD->getInit()) - return inferPossibleTypeFromArithSizeofExpr(Init); - } - } - return QualType(); +void CodeGenFunction::EmitAllocToken(llvm::CallBase *CB, QualType AllocType) { + assert(SanOpts.has(SanitizerKind::AllocToken) && + "Only needed with -fsanitize=alloc-token"); + CB->setMetadata(llvm::LLVMContext::MD_alloc_token, + buildAllocToken(AllocType)); } -/// Deduces the allocated type by checking if the allocation call's result -/// is immediately used in a cast expression. 
For example: -/// -/// MyType *x = (MyType *)malloc(4096); // infers 'MyType' -/// -QualType inferPossibleTypeFromCastExpr(const CallExpr *CallE, - const CastExpr *CastE) { - if (!CastE) - return QualType(); - QualType PtrType = CastE->getType(); - if (PtrType->isPointerType()) - return PtrType->getPointeeType(); - return QualType(); +llvm::MDNode *CodeGenFunction::buildAllocToken(const CallExpr *E) { + QualType AllocType = infer_alloc::inferPossibleType(E, getContext(), CurCast); + if (!AllocType.isNull()) + return buildAllocToken(AllocType); + return nullptr; } -} // end anonymous namespace void CodeGenFunction::EmitAllocToken(llvm::CallBase *CB, const CallExpr *E) { - QualType AllocType; - // First check arguments. - for (const Expr *Arg : E->arguments()) { - AllocType = inferPossibleTypeFromArithSizeofExpr(Arg); - if (AllocType.isNull()) - AllocType = inferPossibleTypeFromVarInitSizeofExpr(Arg); - if (!AllocType.isNull()) - break; - } - // Then check later casts. - if (AllocType.isNull()) - AllocType = inferPossibleTypeFromCastExpr(E, CurCast); - // Emit if we were able to infer the type. 
- if (!AllocType.isNull()) - EmitAllocToken(CB, AllocType); + assert(SanOpts.has(SanitizerKind::AllocToken) && + "Only needed with -fsanitize=alloc-token"); + if (llvm::MDNode *MDN = buildAllocToken(E)) + CB->setMetadata(llvm::LLVMContext::MD_alloc_token, MDN); } CodeGenFunction::ComplexPairTy CodeGenFunction:: diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp index ecab9336a9f82..945f9e2451bc1 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.cpp +++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp @@ -562,17 +562,16 @@ static llvm::Value *createSPIRVBuiltinLoad(IRBuilder<> &B, llvm::Module &M, return B.CreateLoad(Ty, GV); } -llvm::Value * -CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic) { - if (isa(ActiveSemantic.Semantic)) { +llvm::Value *CGHLSLRuntime::emitSystemSemanticLoad( + IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl, + Attr *Semantic, std::optional Index) { + if (isa(Semantic)) { llvm::Function *GroupIndex = CGM.getIntrinsic(getFlattenedThreadIdInGroupIntrinsic()); return B.CreateCall(FunctionCallee(GroupIndex)); } - if (isa(ActiveSemantic.Semantic)) { + if (isa(Semantic)) { llvm::Intrinsic::ID IntrinID = getThreadIdIntrinsic(); llvm::Function *ThreadIDIntrinsic = llvm::Intrinsic::isOverloaded(IntrinID) @@ -581,7 +580,7 @@ CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, return buildVectorInput(B, ThreadIDIntrinsic, Type); } - if (isa(ActiveSemantic.Semantic)) { + if (isa(Semantic)) { llvm::Intrinsic::ID IntrinID = getGroupThreadIdIntrinsic(); llvm::Function *GroupThreadIDIntrinsic = llvm::Intrinsic::isOverloaded(IntrinID) @@ -590,7 +589,7 @@ CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, return buildVectorInput(B, GroupThreadIDIntrinsic, Type); } - if (isa(ActiveSemantic.Semantic)) { + if (isa(Semantic)) { llvm::Intrinsic::ID IntrinID = getGroupIdIntrinsic(); 
llvm::Function *GroupIDIntrinsic = llvm::Intrinsic::isOverloaded(IntrinID) @@ -599,8 +598,7 @@ CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, return buildVectorInput(B, GroupIDIntrinsic, Type); } - if (HLSLSV_PositionAttr *S = - dyn_cast(ActiveSemantic.Semantic)) { + if (HLSLSV_PositionAttr *S = dyn_cast(Semantic)) { if (CGM.getTriple().getEnvironment() == Triple::EnvironmentType::Pixel) return createSPIRVBuiltinLoad(B, CGM.getModule(), Type, S->getAttrName()->getName(), @@ -611,29 +609,56 @@ CGHLSLRuntime::emitSystemSemanticLoad(IRBuilder<> &B, llvm::Type *Type, } llvm::Value * -CGHLSLRuntime::handleScalarSemanticLoad(IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic) { - - if (!ActiveSemantic.Semantic) { - ActiveSemantic.Semantic = Decl->getAttr(); - if (!ActiveSemantic.Semantic) { - CGM.getDiags().Report(Decl->getInnerLocStart(), - diag::err_hlsl_semantic_missing); - return nullptr; +CGHLSLRuntime::handleScalarSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl) { + + HLSLSemanticAttr *Semantic = nullptr; + for (HLSLSemanticAttr *Item : FD->specific_attrs()) { + if (Item->getTargetDecl() == Decl) { + Semantic = Item; + break; } - ActiveSemantic.Index = ActiveSemantic.Semantic->getSemanticIndex(); } + // Sema must create one attribute per scalar field. 
+ assert(Semantic); - return emitSystemSemanticLoad(B, Type, Decl, ActiveSemantic); + std::optional Index = std::nullopt; + if (Semantic->isSemanticIndexExplicit()) + Index = Semantic->getSemanticIndex(); + return emitSystemSemanticLoad(B, Type, Decl, Semantic, Index); } llvm::Value * -CGHLSLRuntime::handleSemanticLoad(IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic) { - assert(!Type->isStructTy()); - return handleScalarSemanticLoad(B, Type, Decl, ActiveSemantic); +CGHLSLRuntime::handleStructSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl) { + const llvm::StructType *ST = cast(Type); + const clang::RecordDecl *RD = Decl->getType()->getAsRecordDecl(); + + assert(std::distance(RD->field_begin(), RD->field_end()) == + ST->getNumElements()); + + llvm::Value *Aggregate = llvm::PoisonValue::get(Type); + auto FieldDecl = RD->field_begin(); + for (unsigned I = 0; I < ST->getNumElements(); ++I) { + llvm::Value *ChildValue = + handleSemanticLoad(B, FD, ST->getElementType(I), *FieldDecl); + assert(ChildValue); + Aggregate = B.CreateInsertValue(Aggregate, ChildValue, I); + ++FieldDecl; + } + + return Aggregate; +} + +llvm::Value * +CGHLSLRuntime::handleSemanticLoad(IRBuilder<> &B, const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl) { + if (Type->isStructTy()) + return handleStructSemanticLoad(B, FD, Type, Decl); + return handleScalarSemanticLoad(B, FD, Type, Decl); } void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD, @@ -680,8 +705,25 @@ void CGHLSLRuntime::emitEntryFunction(const FunctionDecl *FD, } const ParmVarDecl *PD = FD->getParamDecl(Param.getArgNo() - SRetOffset); - SemanticInfo ActiveSemantic = {nullptr, 0}; - Args.push_back(handleSemanticLoad(B, Param.getType(), PD, ActiveSemantic)); + llvm::Value *SemanticValue = nullptr; + if ([[maybe_unused]] HLSLParamModifierAttr *MA = + PD->getAttr()) { + llvm_unreachable("Not 
handled yet"); + } else { + llvm::Type *ParamType = + Param.hasByValAttr() ? Param.getParamByValType() : Param.getType(); + SemanticValue = handleSemanticLoad(B, FD, ParamType, PD); + if (!SemanticValue) + return; + if (Param.hasByValAttr()) { + llvm::Value *Var = B.CreateAlloca(Param.getParamByValType()); + B.CreateStore(SemanticValue, Var); + SemanticValue = Var; + } + } + + assert(SemanticValue); + Args.push_back(SemanticValue); } CallInst *CI = B.CreateCall(FunctionCallee(Fn), Args, OB); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 103b4a98f6c26..d35df524fdc84 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -144,26 +144,24 @@ class CGHLSLRuntime { protected: CodeGenModule &CGM; - void collectInputSemantic(llvm::IRBuilder<> &B, const DeclaratorDecl *D, - llvm::Type *Type, - SmallVectorImpl &Inputs); - - struct SemanticInfo { - clang::HLSLSemanticAttr *Semantic; - uint32_t Index; - }; - llvm::Value *emitSystemSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic); - - llvm::Value *handleScalarSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic); - - llvm::Value *handleSemanticLoad(llvm::IRBuilder<> &B, llvm::Type *Type, - const clang::DeclaratorDecl *Decl, - SemanticInfo &ActiveSemantic); + Attr *Semantic, + std::optional Index); + + llvm::Value *handleScalarSemanticLoad(llvm::IRBuilder<> &B, + const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl); + + llvm::Value *handleStructSemanticLoad(llvm::IRBuilder<> &B, + const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl); + + llvm::Value *handleSemanticLoad(llvm::IRBuilder<> &B, const FunctionDecl *FD, + llvm::Type *Type, + const clang::DeclaratorDecl *Decl); public: CGHLSLRuntime(CodeGenModule &CGM) : CGM(CGM) {} diff --git 
a/clang/lib/CodeGen/CGOpenMPRuntime.cpp b/clang/lib/CodeGen/CGOpenMPRuntime.cpp index 1c3647258163d..a044168205d67 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.cpp +++ b/clang/lib/CodeGen/CGOpenMPRuntime.cpp @@ -2759,14 +2759,6 @@ llvm::Value *CGOpenMPRuntime::emitMessageClause(CodeGenFunction &CGF, return CGF.EmitScalarExpr(Message); } -llvm::Value * -CGOpenMPRuntime::emitMessageClause(CodeGenFunction &CGF, - const OMPMessageClause *MessageClause) { - return emitMessageClause( - CGF, MessageClause ? MessageClause->getMessageString() : nullptr, - MessageClause->getBeginLoc()); -} - llvm::Value * CGOpenMPRuntime::emitSeverityClause(OpenMPSeverityClauseKind Severity, SourceLocation Loc) { @@ -2776,13 +2768,6 @@ CGOpenMPRuntime::emitSeverityClause(OpenMPSeverityClauseKind Severity, Severity == OMPC_SEVERITY_warning ? 1 : 2); } -llvm::Value * -CGOpenMPRuntime::emitSeverityClause(const OMPSeverityClause *SeverityClause) { - return emitSeverityClause(SeverityClause ? SeverityClause->getSeverityKind() - : OMPC_SEVERITY_unknown, - SeverityClause->getBeginLoc()); -} - void CGOpenMPRuntime::emitNumThreadsClause( CodeGenFunction &CGF, llvm::Value *NumThreads, SourceLocation Loc, OpenMPNumThreadsClauseModifier Modifier, OpenMPSeverityClauseKind Severity, diff --git a/clang/lib/CodeGen/CGOpenMPRuntime.h b/clang/lib/CodeGen/CGOpenMPRuntime.h index 25d25255ecf36..105e6b65a6491 100644 --- a/clang/lib/CodeGen/CGOpenMPRuntime.h +++ b/clang/lib/CodeGen/CGOpenMPRuntime.h @@ -1064,13 +1064,9 @@ class CGOpenMPRuntime { virtual llvm::Value *emitMessageClause(CodeGenFunction &CGF, const Expr *Message, SourceLocation Loc); - virtual llvm::Value *emitMessageClause(CodeGenFunction &CGF, - const OMPMessageClause *MessageClause); virtual llvm::Value *emitSeverityClause(OpenMPSeverityClauseKind Severity, SourceLocation Loc); - virtual llvm::Value * - emitSeverityClause(const OMPSeverityClause *SeverityClause); /// Emits call to void __kmpc_push_num_threads(ident_t *loc, kmp_int32 /// 
global_tid, kmp_int32 num_threads) to generate code for 'num_threads' diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h index 71234ffdc0851..90ced670d33e1 100644 --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -3358,9 +3358,14 @@ class CodeGenFunction : public CodeGenTypeCache { SanitizerAnnotateDebugInfo(ArrayRef Ordinals, SanitizerHandler Handler); - /// Emit additional metadata used by the AllocToken instrumentation. + /// Build metadata used by the AllocToken instrumentation. + llvm::MDNode *buildAllocToken(QualType AllocType); + /// Emit and set additional metadata used by the AllocToken instrumentation. void EmitAllocToken(llvm::CallBase *CB, QualType AllocType); - /// Emit additional metadata used by the AllocToken instrumentation, + /// Build additional metadata used by the AllocToken instrumentation, + /// inferring the type from an allocation call expression. + llvm::MDNode *buildAllocToken(const CallExpr *E); + /// Emit and set additional metadata used by the AllocToken instrumentation, /// inferring the type from an allocation call expression. 
void EmitAllocToken(llvm::CallBase *CB, const CallExpr *E); diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index a1fc17bb80212..56a1cc8d6ed2f 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -150,8 +150,6 @@ createTargetCodeGenInfo(CodeGenModule &CGM) { return createWindowsAArch64TargetCodeGenInfo(CGM, AArch64ABIKind::Win64); else if (Target.getABI() == "aapcs-soft") Kind = AArch64ABIKind::AAPCSSoft; - else if (Target.getABI() == "pauthtest") - Kind = AArch64ABIKind::PAuthTest; return createAArch64TargetCodeGenInfo(CGM, Kind); } @@ -1329,22 +1327,29 @@ void CodeGenModule::Release() { "tag-stack-memory-buildattr", 1); if (T.isARM() || T.isThumb() || T.isAArch64()) { + // Previously 1 is used and meant for the backed to derive the function + // attribute form it. 2 now means function attributes already set for all + // functions in this module, so no need to propagate those from the module + // flag. Value is only used in case of LTO module merge because the backend + // will see all required function attribute set already. Value is used + // before modules got merged. Any posive value means the feature is active + // and required binary markings need to be emit accordingly. 
if (LangOpts.BranchTargetEnforcement) getModule().addModuleFlag(llvm::Module::Min, "branch-target-enforcement", - 1); + 2); if (LangOpts.BranchProtectionPAuthLR) getModule().addModuleFlag(llvm::Module::Min, "branch-protection-pauth-lr", - 1); + 2); if (LangOpts.GuardedControlStack) - getModule().addModuleFlag(llvm::Module::Min, "guarded-control-stack", 1); + getModule().addModuleFlag(llvm::Module::Min, "guarded-control-stack", 2); if (LangOpts.hasSignReturnAddress()) - getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 1); + getModule().addModuleFlag(llvm::Module::Min, "sign-return-address", 2); if (LangOpts.isSignReturnAddressScopeAll()) getModule().addModuleFlag(llvm::Module::Min, "sign-return-address-all", - 1); + 2); if (!LangOpts.isSignReturnAddressWithAKey()) getModule().addModuleFlag(llvm::Module::Min, - "sign-return-address-with-bkey", 1); + "sign-return-address-with-bkey", 2); if (LangOpts.PointerAuthELFGOT) getModule().addModuleFlag(llvm::Module::Min, "ptrauth-elf-got", 1); diff --git a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp index 920d28561fc2a..13007221a2ed2 100644 --- a/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp +++ b/clang/lib/CodeGen/TargetBuiltins/RISCV.cpp @@ -1121,6 +1121,8 @@ Value *CodeGenFunction::EmitRISCVBuiltinExpr(unsigned BuiltinID, bool IsMasked = false; // This is used by segment load/store to determine it's llvm type. unsigned SegInstSEW = 8; + // This is used by XSfmm. + unsigned TWiden = 0; // Required for overloaded intrinsics. 
llvm::SmallVector IntrinsicTypes; diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h index d0edae1295094..f63e900669d97 100644 --- a/clang/lib/CodeGen/TargetInfo.h +++ b/clang/lib/CodeGen/TargetInfo.h @@ -483,7 +483,6 @@ enum class AArch64ABIKind { DarwinPCS, Win64, AAPCSSoft, - PAuthTest, }; std::unique_ptr diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp index 0bc4b4b7025f2..e4ad078dab197 100644 --- a/clang/lib/CodeGen/Targets/AMDGPU.cpp +++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp @@ -439,9 +439,11 @@ void AMDGPUTargetCodeGenInfo::setTargetAttributes( return; const FunctionDecl *FD = dyn_cast_or_null(D); - if (FD) + if (FD) { setFunctionDeclAttributes(FD, F, M); - + if (FD->hasAttr() && !M.getLangOpts().OpenCL) + F->setCallingConv(getDeviceKernelCallingConv()); + } if (!getABIInfo().getCodeGenOpts().EmitIEEENaNCompliantInsts) F->addFnAttr("amdgpu-ieee", "false"); } @@ -658,7 +660,7 @@ llvm::Value *AMDGPUTargetCodeGenInfo::createEnqueuedBlockKernel( // kernel address (only the kernel descriptor). 
auto *F = llvm::Function::Create(FT, llvm::GlobalValue::InternalLinkage, Name, &Mod); - F->setCallingConv(llvm::CallingConv::AMDGPU_KERNEL); + F->setCallingConv(getDeviceKernelCallingConv()); llvm::AttrBuilder KernelAttrs(C); // FIXME: The invoke isn't applying the right attributes either diff --git a/clang/lib/CodeGen/Targets/NVPTX.cpp b/clang/lib/CodeGen/Targets/NVPTX.cpp index 53f2fc4a040a7..f6715861d91bc 100644 --- a/clang/lib/CodeGen/Targets/NVPTX.cpp +++ b/clang/lib/CodeGen/Targets/NVPTX.cpp @@ -264,7 +264,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( // And kernel functions are not subject to inlining F->addFnAttr(llvm::Attribute::NoInline); if (FD->hasAttr()) { - F->setCallingConv(llvm::CallingConv::PTX_Kernel); + F->setCallingConv(getDeviceKernelCallingConv()); for (auto IV : llvm::enumerate(FD->parameters())) if (IV.value()->hasAttr()) @@ -278,7 +278,7 @@ void NVPTXTargetCodeGenInfo::setTargetAttributes( } // Attach kernel metadata directly if compiling for NVPTX. if (FD->hasAttr()) - F->setCallingConv(llvm::CallingConv::PTX_Kernel); + F->setCallingConv(getDeviceKernelCallingConv()); } void NVPTXTargetCodeGenInfo::addNVVMMetadata(llvm::GlobalValue *GV, diff --git a/clang/lib/CodeGen/Targets/SPIR.cpp b/clang/lib/CodeGen/Targets/SPIR.cpp index 80e096ecf5ae9..15d0b353d748c 100644 --- a/clang/lib/CodeGen/Targets/SPIR.cpp +++ b/clang/lib/CodeGen/Targets/SPIR.cpp @@ -64,6 +64,8 @@ class CommonSPIRTargetCodeGenInfo : public TargetCodeGenInfo { llvm::Constant *getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::PointerType *T, QualType QT) const override; + void setTargetAttributes(const Decl *D, llvm::GlobalValue *GV, + CodeGen::CodeGenModule &M) const override; }; class SPIRVTargetCodeGenInfo : public CommonSPIRTargetCodeGenInfo { public: @@ -268,6 +270,22 @@ CommonSPIRTargetCodeGenInfo::getNullPointer(const CodeGen::CodeGenModule &CGM, llvm::ConstantPointerNull::get(NPT), PT); } +void CommonSPIRTargetCodeGenInfo::setTargetAttributes( + const Decl 
*D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { + if (M.getLangOpts().OpenCL || GV->isDeclaration()) + return; + + const FunctionDecl *FD = dyn_cast(D); + if (!FD) + return; + + llvm::Function *F = dyn_cast(GV); + assert(F && "Expected GlobalValue to be a Function"); + + if (FD->hasAttr()) + F->setCallingConv(getDeviceKernelCallingConv()); +} + LangAS SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, const VarDecl *D) const { @@ -292,19 +310,23 @@ SPIRVTargetCodeGenInfo::getGlobalVarAddressSpace(CodeGenModule &CGM, void SPIRVTargetCodeGenInfo::setTargetAttributes( const Decl *D, llvm::GlobalValue *GV, CodeGen::CodeGenModule &M) const { - if (!M.getLangOpts().HIP || - M.getTarget().getTriple().getVendor() != llvm::Triple::AMD) - return; if (GV->isDeclaration()) return; - auto F = dyn_cast(GV); - if (!F) + const FunctionDecl *FD = dyn_cast_or_null(D); + if (!FD) return; - auto FD = dyn_cast_or_null(D); - if (!FD) + llvm::Function *F = dyn_cast(GV); + assert(F && "Expected GlobalValue to be a Function"); + + if (FD->hasAttr()) + F->setCallingConv(getDeviceKernelCallingConv()); + + if (!M.getLangOpts().HIP || + M.getTarget().getTriple().getVendor() != llvm::Triple::AMD) return; + if (!FD->hasAttr()) return; diff --git a/clang/lib/Driver/ToolChain.cpp b/clang/lib/Driver/ToolChain.cpp index b1c6ef7bb7e51..5041f59a75c6d 100644 --- a/clang/lib/Driver/ToolChain.cpp +++ b/clang/lib/Driver/ToolChain.cpp @@ -1252,7 +1252,6 @@ std::string ToolChain::ComputeLLVMTriple(const ArgList &Args, } case llvm::Triple::aarch64: { llvm::Triple Triple = getTriple(); - tools::aarch64::setPAuthABIInTriple(getDriver(), Args, Triple); if (!Triple.isOSBinFormatMachO()) return Triple.getTriple(); diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp index eb5d542f55605..e8d5e38a9064f 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -466,27 +466,6 @@ 
void aarch64::getAArch64TargetFeatures(const Driver &D, Features.push_back("+no-bti-at-return-twice"); } -void aarch64::setPAuthABIInTriple(const Driver &D, const ArgList &Args, - llvm::Triple &Triple) { - Arg *ABIArg = Args.getLastArg(options::OPT_mabi_EQ); - bool HasPAuthABI = - ABIArg ? (StringRef(ABIArg->getValue()) == "pauthtest") : false; - - switch (Triple.getEnvironment()) { - case llvm::Triple::UnknownEnvironment: - if (HasPAuthABI) - Triple.setEnvironment(llvm::Triple::PAuthTest); - break; - case llvm::Triple::PAuthTest: - break; - default: - if (HasPAuthABI) - D.Diag(diag::err_drv_unsupported_opt_for_target) - << ABIArg->getAsString(Args) << Triple.getTriple(); - break; - } -} - /// Is the triple {aarch64.aarch64_be}-none-elf? bool aarch64::isAArch64BareMetal(const llvm::Triple &Triple) { if (Triple.getArch() != llvm::Triple::aarch64 && diff --git a/clang/lib/Driver/ToolChains/Arch/AArch64.h b/clang/lib/Driver/ToolChains/Arch/AArch64.h index 2765ee8c3a6aa..97ebfa61a0f0d 100644 --- a/clang/lib/Driver/ToolChains/Arch/AArch64.h +++ b/clang/lib/Driver/ToolChains/Arch/AArch64.h @@ -28,8 +28,6 @@ void getAArch64TargetFeatures(const Driver &D, const llvm::Triple &Triple, std::string getAArch64TargetCPU(const llvm::opt::ArgList &Args, const llvm::Triple &Triple, llvm::opt::Arg *&A); -void setPAuthABIInTriple(const Driver &D, const llvm::opt::ArgList &Args, - llvm::Triple &triple); bool isAArch64BareMetal(const llvm::Triple &Triple); } // end namespace aarch64 diff --git a/clang/lib/Driver/ToolChains/Arch/Mips.cpp b/clang/lib/Driver/ToolChains/Arch/Mips.cpp index 227c6a0d3d202..6a6a4ee1a647b 100644 --- a/clang/lib/Driver/ToolChains/Arch/Mips.cpp +++ b/clang/lib/Driver/ToolChains/Arch/Mips.cpp @@ -502,8 +502,8 @@ bool mips::shouldUseFPXX(const ArgList &Args, const llvm::Triple &Triple, if (Arg *A = Args.getLastArg(options::OPT_mmsa)) if (A->getOption().matches(options::OPT_mmsa)) UseFPXX = llvm::StringSwitch(CPUName) - .Cases("mips32r2", "mips32r3", "mips32r5", 
false) - .Cases("mips64r2", "mips64r3", "mips64r5", false) + .Cases({"mips32r2", "mips32r3", "mips32r5"}, false) + .Cases({"mips64r2", "mips64r3", "mips64r5"}, false) .Default(UseFPXX); return UseFPXX; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index efc9318498cea..9a1c454d5594a 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1387,59 +1387,6 @@ void AddUnalignedAccessWarning(ArgStringList &CmdArgs) { } } -// Each combination of options here forms a signing schema, and in most cases -// each signing schema is its own incompatible ABI. The default values of the -// options represent the default signing schema. -static void handlePAuthABI(const ArgList &DriverArgs, ArgStringList &CC1Args) { - if (!DriverArgs.hasArg(options::OPT_fptrauth_intrinsics, - options::OPT_fno_ptrauth_intrinsics)) - CC1Args.push_back("-fptrauth-intrinsics"); - - if (!DriverArgs.hasArg(options::OPT_fptrauth_calls, - options::OPT_fno_ptrauth_calls)) - CC1Args.push_back("-fptrauth-calls"); - - if (!DriverArgs.hasArg(options::OPT_fptrauth_returns, - options::OPT_fno_ptrauth_returns)) - CC1Args.push_back("-fptrauth-returns"); - - if (!DriverArgs.hasArg(options::OPT_fptrauth_auth_traps, - options::OPT_fno_ptrauth_auth_traps)) - CC1Args.push_back("-fptrauth-auth-traps"); - - if (!DriverArgs.hasArg( - options::OPT_fptrauth_vtable_pointer_address_discrimination, - options::OPT_fno_ptrauth_vtable_pointer_address_discrimination)) - CC1Args.push_back("-fptrauth-vtable-pointer-address-discrimination"); - - if (!DriverArgs.hasArg( - options::OPT_fptrauth_vtable_pointer_type_discrimination, - options::OPT_fno_ptrauth_vtable_pointer_type_discrimination)) - CC1Args.push_back("-fptrauth-vtable-pointer-type-discrimination"); - - if (!DriverArgs.hasArg( - options::OPT_fptrauth_type_info_vtable_pointer_discrimination, - options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination)) - 
CC1Args.push_back("-fptrauth-type-info-vtable-pointer-discrimination"); - - if (!DriverArgs.hasArg(options::OPT_fptrauth_indirect_gotos, - options::OPT_fno_ptrauth_indirect_gotos)) - CC1Args.push_back("-fptrauth-indirect-gotos"); - - if (!DriverArgs.hasArg(options::OPT_fptrauth_init_fini, - options::OPT_fno_ptrauth_init_fini)) - CC1Args.push_back("-fptrauth-init-fini"); - - if (!DriverArgs.hasArg( - options::OPT_fptrauth_init_fini_address_discrimination, - options::OPT_fno_ptrauth_init_fini_address_discrimination)) - CC1Args.push_back("-fptrauth-init-fini-address-discrimination"); - - if (!DriverArgs.hasArg(options::OPT_faarch64_jump_table_hardening, - options::OPT_fno_aarch64_jump_table_hardening)) - CC1Args.push_back("-faarch64-jump-table-hardening"); -} - static void CollectARMPACBTIOptions(const ToolChain &TC, const ArgList &Args, ArgStringList &CmdArgs, bool isAArch64) { const llvm::Triple &Triple = TC.getEffectiveTriple(); @@ -1677,7 +1624,9 @@ void RenderAArch64ABI(const llvm::Triple &Triple, const ArgList &Args, ABIName = A->getValue(); else if (Triple.isOSDarwin()) ABIName = "darwinpcs"; - else if (Triple.getEnvironment() == llvm::Triple::PAuthTest) + // TODO: we probably want to have some target hook here. + else if (Triple.isOSLinux() && + Triple.getEnvironment() == llvm::Triple::PAuthTest) ABIName = "pauthtest"; else ABIName = "aapcs"; @@ -1797,8 +1746,6 @@ void Clang::AddAArch64TargetArgs(const ArgList &Args, options::OPT_fno_ptrauth_objc_interface_sel); Args.addOptInFlag(CmdArgs, options::OPT_fptrauth_objc_class_ro, options::OPT_fno_ptrauth_objc_class_ro); - if (Triple.getEnvironment() == llvm::Triple::PAuthTest) - handlePAuthABI(Args, CmdArgs); // Enable/disable return address signing and indirect branch targets. 
CollectARMPACBTIOptions(getToolChain(), Args, CmdArgs, true /*isAArch64*/); @@ -9298,6 +9245,9 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, }; auto ShouldForward = [&](const llvm::DenseSet &Set, Arg *A, const ToolChain &TC) { + // CMake hack to avoid printing verbose informatoin for HIP non-RDC mode. + if (A->getOption().matches(OPT_v) && JA.getType() == types::TY_Object) + return false; return (Set.contains(A->getOption().getID()) || (A->getOption().getGroup().isValid() && Set.contains(A->getOption().getGroup().getID()))) && @@ -9395,7 +9345,12 @@ void LinkerWrapper::ConstructJob(Compilation &C, const JobAction &JA, CmdArgs.push_back( Args.MakeArgString("--host-triple=" + getToolChain().getTripleString())); - if (Args.hasArg(options::OPT_v)) + + // CMake hack, suppress passing verbose arguments for the special-case HIP + // non-RDC mode compilation. This confuses default CMake implicit linker + // argument parsing when the language is set to HIP and the system linker is + // also `ld.lld`. 
+ if (Args.hasArg(options::OPT_v) && JA.getType() != types::TY_Object) CmdArgs.push_back("--wrapper-verbose"); if (Arg *A = Args.getLastArg(options::OPT_cuda_path_EQ)) CmdArgs.push_back( diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index b613ebaf17c49..52e43d1a003eb 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -687,11 +687,11 @@ static std::string getAMDGPUTargetGPU(const llvm::Triple &T, if (A) { auto GPUName = getProcessorFromTargetID(T, A->getValue()); return llvm::StringSwitch(GPUName) - .Cases("rv630", "rv635", "r600") - .Cases("rv610", "rv620", "rs780", "rs880") + .Cases({"rv630", "rv635"}, "r600") + .Cases({"rv610", "rv620", "rs780"}, "rs880") .Case("rv740", "rv770") .Case("palm", "cedar") - .Cases("sumo", "sumo2", "sumo") + .Cases({"sumo", "sumo2"}, "sumo") .Case("hemlock", "cypress") .Case("aruba", "cayman") .Default(GPUName.str()); @@ -974,6 +974,24 @@ bool tools::isTLSDESCEnabled(const ToolChain &TC, return EnableTLSDESC; } +void tools::addDTLTOOptions(const ToolChain &ToolChain, const ArgList &Args, + llvm::opt::ArgStringList &CmdArgs) { + if (Arg *A = Args.getLastArg(options::OPT_fthinlto_distributor_EQ)) { + CmdArgs.push_back( + Args.MakeArgString("--thinlto-distributor=" + Twine(A->getValue()))); + const Driver &D = ToolChain.getDriver(); + CmdArgs.push_back(Args.MakeArgString("--thinlto-remote-compiler=" + + Twine(D.getClangProgramPath()))); + if (auto *PA = D.getPrependArg()) + CmdArgs.push_back(Args.MakeArgString( + "--thinlto-remote-compiler-prepend-arg=" + Twine(PA))); + + for (const auto &A : + Args.getAllArgValues(options::OPT_Xthinlto_distributor_EQ)) + CmdArgs.push_back(Args.MakeArgString("--thinlto-distributor-arg=" + A)); + } +} + void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, ArgStringList &CmdArgs, const InputInfo &Output, const InputInfoList &Inputs, bool IsThinLTO) { @@ -1376,16 +1394,7 @@ 
void tools::addLTOOptions(const ToolChain &ToolChain, const ArgList &Args, CmdArgs.push_back( Args.MakeArgString(Twine(PluginOptPrefix) + "-time-passes")); - if (Arg *A = Args.getLastArg(options::OPT_fthinlto_distributor_EQ)) { - CmdArgs.push_back( - Args.MakeArgString("--thinlto-distributor=" + Twine(A->getValue()))); - CmdArgs.push_back( - Args.MakeArgString("--thinlto-remote-compiler=" + - Twine(ToolChain.getDriver().getClangProgramPath()))); - - for (auto A : Args.getAllArgValues(options::OPT_Xthinlto_distributor_EQ)) - CmdArgs.push_back(Args.MakeArgString("--thinlto-distributor-arg=" + A)); - } + addDTLTOOptions(ToolChain, Args, CmdArgs); } void tools::addOpenMPRuntimeSpecificRPath(const ToolChain &TC, diff --git a/clang/lib/Driver/ToolChains/Fuchsia.cpp b/clang/lib/Driver/ToolChains/Fuchsia.cpp index 146dc8bbd5313..31c2f3f7e1be4 100644 --- a/clang/lib/Driver/ToolChains/Fuchsia.cpp +++ b/clang/lib/Driver/ToolChains/Fuchsia.cpp @@ -481,9 +481,11 @@ SanitizerMask Fuchsia::getSupportedSanitizers() const { Res |= SanitizerKind::Fuzzer; Res |= SanitizerKind::FuzzerNoLink; Res |= SanitizerKind::Leak; - Res |= SanitizerKind::SafeStack; Res |= SanitizerKind::Scudo; Res |= SanitizerKind::Thread; + if (getTriple().getArch() == llvm::Triple::x86_64) { + Res |= SanitizerKind::SafeStack; + } return Res; } diff --git a/clang/lib/Driver/ToolChains/Linux.cpp b/clang/lib/Driver/ToolChains/Linux.cpp index c2897f464bd3c..f452109134171 100644 --- a/clang/lib/Driver/ToolChains/Linux.cpp +++ b/clang/lib/Driver/ToolChains/Linux.cpp @@ -445,6 +445,102 @@ std::string Linux::computeSysRoot() const { return std::string(); } +static void setPAuthABIInTriple(const Driver &D, const ArgList &Args, + llvm::Triple &Triple) { + Arg *ABIArg = Args.getLastArg(options::OPT_mabi_EQ); + bool HasPAuthABI = + ABIArg ? 
(StringRef(ABIArg->getValue()) == "pauthtest") : false; + + switch (Triple.getEnvironment()) { + case llvm::Triple::UnknownEnvironment: + if (HasPAuthABI) + Triple.setEnvironment(llvm::Triple::PAuthTest); + break; + case llvm::Triple::PAuthTest: + break; + default: + if (HasPAuthABI) + D.Diag(diag::err_drv_unsupported_opt_for_target) + << ABIArg->getAsString(Args) << Triple.getTriple(); + break; + } +} + +std::string Linux::ComputeEffectiveClangTriple(const llvm::opt::ArgList &Args, + types::ID InputType) const { + std::string TripleString = + Generic_ELF::ComputeEffectiveClangTriple(Args, InputType); + if (getTriple().isAArch64()) { + llvm::Triple Triple(TripleString); + setPAuthABIInTriple(getDriver(), Args, Triple); + return Triple.getTriple(); + } + return TripleString; +} + +// Each combination of options here forms a signing schema, and in most cases +// each signing schema is its own incompatible ABI. The default values of the +// options represent the default signing schema. +static void handlePAuthABI(const Driver &D, const ArgList &DriverArgs, + ArgStringList &CC1Args) { + if (!DriverArgs.hasArg(options::OPT_fptrauth_intrinsics, + options::OPT_fno_ptrauth_intrinsics)) + CC1Args.push_back("-fptrauth-intrinsics"); + + if (!DriverArgs.hasArg(options::OPT_fptrauth_calls, + options::OPT_fno_ptrauth_calls)) + CC1Args.push_back("-fptrauth-calls"); + + if (!DriverArgs.hasArg(options::OPT_fptrauth_returns, + options::OPT_fno_ptrauth_returns)) + CC1Args.push_back("-fptrauth-returns"); + + if (!DriverArgs.hasArg(options::OPT_fptrauth_auth_traps, + options::OPT_fno_ptrauth_auth_traps)) + CC1Args.push_back("-fptrauth-auth-traps"); + + if (!DriverArgs.hasArg( + options::OPT_fptrauth_vtable_pointer_address_discrimination, + options::OPT_fno_ptrauth_vtable_pointer_address_discrimination)) + CC1Args.push_back("-fptrauth-vtable-pointer-address-discrimination"); + + if (!DriverArgs.hasArg( + options::OPT_fptrauth_vtable_pointer_type_discrimination, + 
options::OPT_fno_ptrauth_vtable_pointer_type_discrimination)) + CC1Args.push_back("-fptrauth-vtable-pointer-type-discrimination"); + + if (!DriverArgs.hasArg( + options::OPT_fptrauth_type_info_vtable_pointer_discrimination, + options::OPT_fno_ptrauth_type_info_vtable_pointer_discrimination)) + CC1Args.push_back("-fptrauth-type-info-vtable-pointer-discrimination"); + + if (!DriverArgs.hasArg(options::OPT_fptrauth_indirect_gotos, + options::OPT_fno_ptrauth_indirect_gotos)) + CC1Args.push_back("-fptrauth-indirect-gotos"); + + if (!DriverArgs.hasArg(options::OPT_fptrauth_init_fini, + options::OPT_fno_ptrauth_init_fini)) + CC1Args.push_back("-fptrauth-init-fini"); + + if (!DriverArgs.hasArg( + options::OPT_fptrauth_init_fini_address_discrimination, + options::OPT_fno_ptrauth_init_fini_address_discrimination)) + CC1Args.push_back("-fptrauth-init-fini-address-discrimination"); + + if (!DriverArgs.hasArg(options::OPT_faarch64_jump_table_hardening, + options::OPT_fno_aarch64_jump_table_hardening)) + CC1Args.push_back("-faarch64-jump-table-hardening"); +} + +void Linux::addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const { + llvm::Triple Triple(ComputeEffectiveClangTriple(DriverArgs)); + if (Triple.isAArch64() && Triple.getEnvironment() == llvm::Triple::PAuthTest) + handlePAuthABI(getDriver(), DriverArgs, CC1Args); + Generic_ELF::addClangTargetOptions(DriverArgs, CC1Args, DeviceOffloadKind); +} + std::string Linux::getDynamicLinker(const ArgList &Args) const { const llvm::Triple::ArchType Arch = getArch(); const llvm::Triple &Triple = getTriple(); diff --git a/clang/lib/Driver/ToolChains/Linux.h b/clang/lib/Driver/ToolChains/Linux.h index c90e7308e86d2..74be01d026681 100644 --- a/clang/lib/Driver/ToolChains/Linux.h +++ b/clang/lib/Driver/ToolChains/Linux.h @@ -59,7 +59,14 @@ class LLVM_LIBRARY_VISIBILITY Linux : public Generic_ELF { SanitizerMask getSupportedSanitizers() const 
override; void addProfileRTLibs(const llvm::opt::ArgList &Args, llvm::opt::ArgStringList &CmdArgs) const override; + std::string ComputeEffectiveClangTriple( + const llvm::opt::ArgList &Args, + types::ID InputType = types::TY_INVALID) const override; std::string computeSysRoot() const override; + void + addClangTargetOptions(const llvm::opt::ArgList &DriverArgs, + llvm::opt::ArgStringList &CC1Args, + Action::OffloadKind DeviceOffloadKind) const override; std::string getDynamicLinker(const llvm::opt::ArgList &Args) const override; diff --git a/clang/lib/Driver/ToolChains/PS4CPU.cpp b/clang/lib/Driver/ToolChains/PS4CPU.cpp index 61afc61a53dfd..34ec65ae59602 100644 --- a/clang/lib/Driver/ToolChains/PS4CPU.cpp +++ b/clang/lib/Driver/ToolChains/PS4CPU.cpp @@ -344,16 +344,7 @@ void tools::PS5cpu::Linker::ConstructJob(Compilation &C, const JobAction &JA, // pass LTO options to ensure proper codegen, metadata production, etc if // LTO indeed occurs. - if (const Arg *A = Args.getLastArg(options::OPT_fthinlto_distributor_EQ)) { - CmdArgs.push_back( - Args.MakeArgString("--thinlto-distributor=" + Twine(A->getValue()))); - CmdArgs.push_back(Args.MakeArgString("--thinlto-remote-compiler=" + - Twine(D.getClangProgramPath()))); - - for (const auto &A : - Args.getAllArgValues(options::OPT_Xthinlto_distributor_EQ)) - CmdArgs.push_back(Args.MakeArgString("--thinlto-distributor-arg=" + A)); - } + tools::addDTLTOOptions(TC, Args, CmdArgs); if (Args.hasFlag(options::OPT_funified_lto, options::OPT_fno_unified_lto, true)) diff --git a/clang/lib/Format/ContinuationIndenter.cpp b/clang/lib/Format/ContinuationIndenter.cpp index 37c10c66503bb..e5abf833194d4 100644 --- a/clang/lib/Format/ContinuationIndenter.cpp +++ b/clang/lib/Format/ContinuationIndenter.cpp @@ -798,9 +798,11 @@ void ContinuationIndenter::addTokenOnCurrentLine(LineState &State, bool DryRun, } if (!DryRun) { + const bool ContinuePPDirective = + State.Line->InMacroBody && Current.isNot(TT_LineComment); 
Whitespaces.replaceWhitespace(Current, /*Newlines=*/0, Spaces, State.Column + Spaces + PPColumnCorrection, - /*IsAligned=*/false, State.Line->InMacroBody); + /*IsAligned=*/false, ContinuePPDirective); } // If "BreakBeforeInheritanceComma" mode, don't break within the inheritance @@ -1176,10 +1178,11 @@ unsigned ContinuationIndenter::addTokenOnNewLine(LineState &State, // about removing empty lines on closing blocks. Special case them here. MaxEmptyLinesToKeep = 1; } - unsigned Newlines = + const unsigned Newlines = std::max(1u, std::min(Current.NewlinesBefore, MaxEmptyLinesToKeep)); - bool ContinuePPDirective = - State.Line->InPPDirective && State.Line->Type != LT_ImportStatement; + const bool ContinuePPDirective = State.Line->InPPDirective && + State.Line->Type != LT_ImportStatement && + Current.isNot(TT_LineComment); Whitespaces.replaceWhitespace(Current, Newlines, State.Column, State.Column, CurrentState.IsAligned, ContinuePPDirective); } diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index c97a9e81eb59e..1d0dfd0b9c151 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -4407,8 +4407,12 @@ unsigned TokenAnnotator::splitPenalty(const AnnotatedLine &Line, // breaking after it. if (Right.is(TT_SelectorName)) return 0; - if (Left.is(tok::colon) && Left.is(TT_ObjCMethodExpr)) - return Line.MightBeFunctionDecl ? 50 : 500; + if (Left.is(tok::colon)) { + if (Left.is(TT_ObjCMethodExpr)) + return Line.MightBeFunctionDecl ? 
50 : 500; + if (Left.is(TT_ObjCSelector)) + return 500; + } // In Objective-C type declarations, avoid breaking after the category's // open paren (we'll prefer breaking after the protocol list's opening @@ -6291,7 +6295,9 @@ bool TokenAnnotator::canBreakBefore(const AnnotatedLine &Line, TT_BitFieldColon)) { return false; } - if (Left.is(tok::colon) && Left.isOneOf(TT_DictLiteral, TT_ObjCMethodExpr)) { + if (Left.is(tok::colon) && Left.isOneOf(TT_ObjCSelector, TT_ObjCMethodExpr)) + return true; + if (Left.is(tok::colon) && Left.is(TT_DictLiteral)) { if (Style.isProto()) { if (!Style.AlwaysBreakBeforeMultilineStrings && Right.isStringLiteral()) return false; diff --git a/clang/lib/Frontend/ASTUnit.cpp b/clang/lib/Frontend/ASTUnit.cpp index cb445682ac48b..6cc7094846155 100644 --- a/clang/lib/Frontend/ASTUnit.cpp +++ b/clang/lib/Frontend/ASTUnit.cpp @@ -512,152 +512,73 @@ namespace { /// Gathers information from ASTReader that will be used to initialize /// a Preprocessor. class ASTInfoCollector : public ASTReaderListener { - Preprocessor &PP; - ASTContext *Context; HeaderSearchOptions &HSOpts; + std::string &SpecificModuleCachePath; PreprocessorOptions &PPOpts; - LangOptions &LangOpt; + LangOptions &LangOpts; CodeGenOptions &CodeGenOpts; - std::shared_ptr &TargetOpts; - IntrusiveRefCntPtr &Target; + TargetOptions &TargetOpts; unsigned &Counter; - bool InitializedLanguage = false; - bool InitializedHeaderSearchPaths = false; public: - ASTInfoCollector(Preprocessor &PP, ASTContext *Context, - HeaderSearchOptions &HSOpts, PreprocessorOptions &PPOpts, - LangOptions &LangOpt, CodeGenOptions &CodeGenOpts, - std::shared_ptr &TargetOpts, - IntrusiveRefCntPtr &Target, unsigned &Counter) - : PP(PP), Context(Context), HSOpts(HSOpts), PPOpts(PPOpts), - LangOpt(LangOpt), CodeGenOpts(CodeGenOpts), TargetOpts(TargetOpts), - Target(Target), Counter(Counter) {} - - bool ReadLanguageOptions(const LangOptions &LangOpts, + ASTInfoCollector(HeaderSearchOptions &HSOpts, + std::string 
&SpecificModuleCachePath, + PreprocessorOptions &PPOpts, LangOptions &LangOpts, + CodeGenOptions &CodeGenOpts, TargetOptions &TargetOpts, + unsigned &Counter) + : HSOpts(HSOpts), SpecificModuleCachePath(SpecificModuleCachePath), + PPOpts(PPOpts), LangOpts(LangOpts), CodeGenOpts(CodeGenOpts), + TargetOpts(TargetOpts), Counter(Counter) {} + + bool ReadLanguageOptions(const LangOptions &NewLangOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - if (InitializedLanguage) - return false; - - // FIXME: We did similar things in ReadHeaderSearchOptions too. But such - // style is not scaling. Probably we need to invite some mechanism to - // handle such patterns generally. - auto PICLevel = LangOpt.PICLevel; - auto PIE = LangOpt.PIE; - - LangOpt = LangOpts; - - LangOpt.PICLevel = PICLevel; - LangOpt.PIE = PIE; - - InitializedLanguage = true; - - updated(); + LangOpts = NewLangOpts; return false; } - bool ReadCodeGenOptions(const CodeGenOptions &CGOpts, + bool ReadCodeGenOptions(const CodeGenOptions &NewCodeGenOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - this->CodeGenOpts = CGOpts; + CodeGenOpts = NewCodeGenOpts; return false; } - bool ReadHeaderSearchOptions(const HeaderSearchOptions &HSOpts, + bool ReadHeaderSearchOptions(const HeaderSearchOptions &NewHSOpts, StringRef ModuleFilename, - StringRef SpecificModuleCachePath, + StringRef NewSpecificModuleCachePath, bool Complain) override { - // llvm::SaveAndRestore doesn't support bit field. 
- auto ForceCheckCXX20ModulesInputFiles = - this->HSOpts.ForceCheckCXX20ModulesInputFiles; - llvm::SaveAndRestore X(this->HSOpts.UserEntries); - llvm::SaveAndRestore Y(this->HSOpts.SystemHeaderPrefixes); - llvm::SaveAndRestore Z(this->HSOpts.VFSOverlayFiles); - - this->HSOpts = HSOpts; - this->HSOpts.ForceCheckCXX20ModulesInputFiles = - ForceCheckCXX20ModulesInputFiles; - + HSOpts = NewHSOpts; + SpecificModuleCachePath = NewSpecificModuleCachePath; return false; } - bool ReadHeaderSearchPaths(const HeaderSearchOptions &HSOpts, + bool ReadHeaderSearchPaths(const HeaderSearchOptions &NewHSOpts, bool Complain) override { - if (InitializedHeaderSearchPaths) - return false; - - this->HSOpts.UserEntries = HSOpts.UserEntries; - this->HSOpts.SystemHeaderPrefixes = HSOpts.SystemHeaderPrefixes; - this->HSOpts.VFSOverlayFiles = HSOpts.VFSOverlayFiles; - - // Initialize the FileManager. We can't do this in update(), since that - // performs the initialization too late (once both target and language - // options are read). - PP.getFileManager().setVirtualFileSystem(createVFSFromOverlayFiles( - HSOpts.VFSOverlayFiles, PP.getDiagnostics(), - PP.getFileManager().getVirtualFileSystemPtr())); - - InitializedHeaderSearchPaths = true; - + HSOpts.UserEntries = NewHSOpts.UserEntries; + HSOpts.SystemHeaderPrefixes = NewHSOpts.SystemHeaderPrefixes; + HSOpts.VFSOverlayFiles = NewHSOpts.VFSOverlayFiles; return false; } - bool ReadPreprocessorOptions(const PreprocessorOptions &PPOpts, + bool ReadPreprocessorOptions(const PreprocessorOptions &NewPPOpts, StringRef ModuleFilename, bool ReadMacros, bool Complain, std::string &SuggestedPredefines) override { - this->PPOpts = PPOpts; + PPOpts = NewPPOpts; return false; } - bool ReadTargetOptions(const TargetOptions &TargetOpts, + bool ReadTargetOptions(const TargetOptions &NewTargetOpts, StringRef ModuleFilename, bool Complain, bool AllowCompatibleDifferences) override { - // If we've already initialized the target, don't do it again. 
- if (Target) - return false; - - this->TargetOpts = std::make_shared(TargetOpts); - Target = - TargetInfo::CreateTargetInfo(PP.getDiagnostics(), *this->TargetOpts); - - updated(); + TargetOpts = NewTargetOpts; return false; } void ReadCounter(const serialization::ModuleFile &M, - unsigned Value) override { - Counter = Value; - } - -private: - void updated() { - if (!Target || !InitializedLanguage) - return; - - // Inform the target of the language options. - // - // FIXME: We shouldn't need to do this, the target should be immutable once - // created. This complexity should be lifted elsewhere. - Target->adjust(PP.getDiagnostics(), LangOpt, /*AuxTarget=*/nullptr); - - // Initialize the preprocessor. - PP.Initialize(*Target); - - if (!Context) - return; - - // Initialize the ASTContext - Context->InitBuiltinTypes(*Target); - - // Adjust printing policy based on language options. - Context->setPrintingPolicy(PrintingPolicy(LangOpt)); - - // We didn't have access to the comment options when the ASTContext was - // constructed, so register them now. - Context->getCommentCommandTraits().registerCommentOptions( - LangOpt.CommentOpts); + unsigned NewCounter) override { + Counter = NewCounter; } }; @@ -812,7 +733,7 @@ std::unique_ptr ASTUnit::LoadFromASTFile( std::shared_ptr DiagOpts, IntrusiveRefCntPtr Diags, const FileSystemOptions &FileSystemOpts, const HeaderSearchOptions &HSOpts, - const LangOptions *LangOpts, bool OnlyLocalDecls, + const LangOptions *ProvidedLangOpts, bool OnlyLocalDecls, CaptureDiagsKind CaptureDiagnostics, bool AllowASTWithCompilerErrors, bool UserFilesAreVolatile) { std::unique_ptr AST(new ASTUnit(true)); @@ -826,41 +747,71 @@ std::unique_ptr ASTUnit::LoadFromASTFile( ConfigureDiags(Diags, *AST, CaptureDiagnostics); - AST->LangOpts = LangOpts ? 
std::make_unique(*LangOpts) - : std::make_unique(); + std::unique_ptr LocalLangOpts; + const LangOptions &LangOpts = [&]() -> const LangOptions & { + if (ProvidedLangOpts) + return *ProvidedLangOpts; + LocalLangOpts = std::make_unique(); + return *LocalLangOpts; + }(); + + AST->LangOpts = std::make_unique(LangOpts); AST->OnlyLocalDecls = OnlyLocalDecls; AST->CaptureDiagnostics = CaptureDiagnostics; AST->DiagOpts = DiagOpts; AST->Diagnostics = Diags; - AST->FileMgr = llvm::makeIntrusiveRefCnt(FileSystemOpts, VFS); AST->UserFilesAreVolatile = UserFilesAreVolatile; - AST->SourceMgr = llvm::makeIntrusiveRefCnt( - AST->getDiagnostics(), AST->getFileManager(), UserFilesAreVolatile); - AST->ModCache = createCrossProcessModuleCache(); AST->HSOpts = std::make_unique(HSOpts); AST->HSOpts->ModuleFormat = std::string(PCHContainerRdr.getFormats().front()); - AST->HeaderInfo.reset(new HeaderSearch(AST->getHeaderSearchOpts(), - AST->getSourceManager(), - AST->getDiagnostics(), - AST->getLangOpts(), - /*Target=*/nullptr)); AST->PPOpts = std::make_shared(); + AST->CodeGenOpts = std::make_unique(); + AST->TargetOpts = std::make_shared(); + + AST->ModCache = createCrossProcessModuleCache(); + + // Gather info for preprocessor construction later on. + std::string SpecificModuleCachePath; + unsigned Counter = 0; + // Using a temporary FileManager since the AST file might specify custom + // HeaderSearchOptions::VFSOverlayFiles that affect the underlying VFS. 
+ FileManager TmpFileMgr(FileSystemOpts, VFS); + ASTInfoCollector Collector(*AST->HSOpts, SpecificModuleCachePath, + *AST->PPOpts, *AST->LangOpts, *AST->CodeGenOpts, + *AST->TargetOpts, Counter); + if (ASTReader::readASTFileControlBlock( + Filename, TmpFileMgr, *AST->ModCache, PCHContainerRdr, + /*FindModuleFileExtensions=*/true, Collector, + /*ValidateDiagnosticOptions=*/true, ASTReader::ARR_None)) { + AST->getDiagnostics().Report(diag::err_fe_unable_to_load_pch); + return nullptr; + } + + VFS = createVFSFromOverlayFiles(AST->HSOpts->VFSOverlayFiles, + *AST->Diagnostics, std::move(VFS)); - // Gather Info for preprocessor construction later on. + AST->FileMgr = llvm::makeIntrusiveRefCnt(FileSystemOpts, VFS); + + AST->SourceMgr = llvm::makeIntrusiveRefCnt( + AST->getDiagnostics(), AST->getFileManager(), UserFilesAreVolatile); - HeaderSearch &HeaderInfo = *AST->HeaderInfo; + AST->HSOpts->PrebuiltModuleFiles = HSOpts.PrebuiltModuleFiles; + AST->HSOpts->PrebuiltModulePaths = HSOpts.PrebuiltModulePaths; + AST->HeaderInfo = std::make_unique( + AST->getHeaderSearchOpts(), AST->getSourceManager(), + AST->getDiagnostics(), AST->getLangOpts(), + /*Target=*/nullptr); + AST->HeaderInfo->setModuleCachePath(SpecificModuleCachePath); AST->PP = std::make_shared( *AST->PPOpts, AST->getDiagnostics(), *AST->LangOpts, - AST->getSourceManager(), HeaderInfo, AST->ModuleLoader, + AST->getSourceManager(), *AST->HeaderInfo, AST->ModuleLoader, /*IILookup=*/nullptr, /*OwnsHeaderSearch=*/false); - Preprocessor &PP = *AST->PP; if (ToLoad >= LoadASTOnly) AST->Ctx = llvm::makeIntrusiveRefCnt( - *AST->LangOpts, AST->getSourceManager(), PP.getIdentifierTable(), - PP.getSelectorTable(), PP.getBuiltinInfo(), + *AST->LangOpts, AST->getSourceManager(), AST->PP->getIdentifierTable(), + AST->PP->getSelectorTable(), AST->PP->getBuiltinInfo(), AST->getTranslationUnitKind()); DisableValidationForModuleKind disableValid = @@ -868,24 +819,60 @@ std::unique_ptr ASTUnit::LoadFromASTFile( if 
(::getenv("LIBCLANG_DISABLE_PCH_VALIDATION")) disableValid = DisableValidationForModuleKind::All; AST->Reader = llvm::makeIntrusiveRefCnt( - PP, *AST->ModCache, AST->Ctx.get(), PCHContainerRdr, *AST->CodeGenOpts, - ArrayRef>(), + *AST->PP, *AST->ModCache, AST->Ctx.get(), PCHContainerRdr, + *AST->CodeGenOpts, ArrayRef>(), /*isysroot=*/"", /*DisableValidationKind=*/disableValid, AllowASTWithCompilerErrors); - unsigned Counter = 0; - AST->Reader->setListener(std::make_unique( - *AST->PP, AST->Ctx.get(), *AST->HSOpts, *AST->PPOpts, *AST->LangOpts, - *AST->CodeGenOpts, AST->TargetOpts, AST->Target, Counter)); - - // Attach the AST reader to the AST context as an external AST - // source, so that declarations will be deserialized from the - // AST file as needed. + // Attach the AST reader to the AST context as an external AST source, so that + // declarations will be deserialized from the AST file as needed. // We need the external source to be set up before we read the AST, because // eagerly-deserialized declarations may use it. if (AST->Ctx) AST->Ctx->setExternalSource(AST->Reader); + AST->Target = + TargetInfo::CreateTargetInfo(AST->PP->getDiagnostics(), *AST->TargetOpts); + // Inform the target of the language options. + // + // FIXME: We shouldn't need to do this, the target should be immutable once + // created. This complexity should be lifted elsewhere. + AST->Target->adjust(AST->PP->getDiagnostics(), *AST->LangOpts, + /*AuxTarget=*/nullptr); + + // Initialize the preprocessor. + AST->PP->Initialize(*AST->Target); + + AST->PP->setCounterValue(Counter); + + if (AST->Ctx) { + // Initialize the ASTContext + AST->Ctx->InitBuiltinTypes(*AST->Target); + + // Adjust printing policy based on language options. + AST->Ctx->setPrintingPolicy(PrintingPolicy(*AST->LangOpts)); + + // We didn't have access to the comment options when the ASTContext was + // constructed, so register them now. 
+ AST->Ctx->getCommentCommandTraits().registerCommentOptions( + AST->LangOpts->CommentOpts); + } + + // The temporary FileManager we used for ASTReader::readASTFileControlBlock() + // might have already read stdin, and reading it again will fail. Let's + // explicitly forward the buffer. + if (Filename == "-") + if (auto FE = llvm::expectedToOptional(TmpFileMgr.getSTDIN())) + if (auto BufRef = TmpFileMgr.getBufferForFile(*FE)) { + auto Buf = llvm::MemoryBuffer::getMemBufferCopy( + (*BufRef)->getBuffer(), (*BufRef)->getBufferIdentifier()); + AST->Reader->getModuleManager().addInMemoryBuffer("-", std::move(Buf)); + } + + // Reinstate the provided options that are relevant for reading AST files. + AST->HSOpts->ForceCheckCXX20ModulesInputFiles = + HSOpts.ForceCheckCXX20ModulesInputFiles; + switch (AST->Reader->ReadAST(Filename, serialization::MK_MainFile, SourceLocation(), ASTReader::ARR_None)) { case ASTReader::Success: @@ -901,11 +888,18 @@ std::unique_ptr ASTUnit::LoadFromASTFile( return nullptr; } - AST->OriginalSourceFile = std::string(AST->Reader->getOriginalSourceFile()); + // Now that we have successfully loaded the AST file, we can reinstate some + // options that the clients expect us to preserve (but would trip AST file + // validation, so we couldn't set them earlier). 
+ AST->HSOpts->UserEntries = HSOpts.UserEntries; + AST->HSOpts->SystemHeaderPrefixes = HSOpts.SystemHeaderPrefixes; + AST->HSOpts->VFSOverlayFiles = HSOpts.VFSOverlayFiles; + AST->LangOpts->PICLevel = LangOpts.PICLevel; + AST->LangOpts->PIE = LangOpts.PIE; - PP.setCounterValue(Counter); + AST->OriginalSourceFile = std::string(AST->Reader->getOriginalSourceFile()); - Module *M = HeaderInfo.lookupModule(AST->getLangOpts().CurrentModule); + Module *M = AST->HeaderInfo->lookupModule(AST->getLangOpts().CurrentModule); if (M && AST->getLangOpts().isCompilingModule() && M->isNamedModule()) AST->Ctx->setCurrentNamedModule(M); @@ -915,13 +909,14 @@ std::unique_ptr ASTUnit::LoadFromASTFile( // Create a semantic analysis object and tell the AST reader about it. if (ToLoad >= LoadEverything) { - AST->TheSema.reset(new Sema(PP, *AST->Ctx, *AST->Consumer)); + AST->TheSema = std::make_unique(*AST->PP, *AST->Ctx, *AST->Consumer); AST->TheSema->Initialize(); AST->Reader->InitializeSema(*AST->TheSema); } // Tell the diagnostic client that we have started a source file. - AST->getDiagnostics().getClient()->BeginSourceFile(PP.getLangOpts(), &PP); + AST->getDiagnostics().getClient()->BeginSourceFile(AST->PP->getLangOpts(), + AST->PP.get()); return AST; } @@ -1651,6 +1646,7 @@ ASTUnit *ASTUnit::LoadFromCompilerInvocationAction( AST->Reader = nullptr; // Create a file manager object to provide access to and cache the filesystem. + Clang->setVirtualFileSystem(AST->getVirtualFileSystemPtr()); Clang->setFileManager(AST->getFileManagerPtr()); // Create the source manager. @@ -2290,6 +2286,7 @@ void ASTUnit::CodeComplete( "IR inputs not support here!"); // Use the source and file managers that we were given. 
+ Clang->setVirtualFileSystem(FileMgr->getVirtualFileSystemPtr()); Clang->setFileManager(FileMgr); Clang->setSourceManager(SourceMgr); diff --git a/clang/lib/Frontend/CompilerInstance.cpp b/clang/lib/Frontend/CompilerInstance.cpp index 584436665622d..6b09f7f9fc1e3 100644 --- a/clang/lib/Frontend/CompilerInstance.cpp +++ b/clang/lib/Frontend/CompilerInstance.cpp @@ -160,8 +160,6 @@ bool CompilerInstance::createTarget() { } void CompilerInstance::setFileManager(IntrusiveRefCntPtr Value) { - if (!hasVirtualFileSystem()) - setVirtualFileSystem(Value->getVirtualFileSystemPtr()); assert(Value == nullptr || getVirtualFileSystemPtr() == Value->getVirtualFileSystemPtr()); FileMgr = std::move(Value); @@ -548,14 +546,11 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) { std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) { assert(FileMgr && "Specific module cache path requires a FileManager"); - if (getHeaderSearchOpts().ModuleCachePath.empty()) - return ""; - // Set up the module path, including the hash for the module-creation options. 
SmallString<256> SpecificModuleCache; normalizeModuleCachePath(*FileMgr, getHeaderSearchOpts().ModuleCachePath, SpecificModuleCache); - if (!getHeaderSearchOpts().DisableModuleHash) + if (!SpecificModuleCache.empty() && !getHeaderSearchOpts().DisableModuleHash) llvm::sys::path::append(SpecificModuleCache, ModuleHash); return std::string(SpecificModuleCache); } @@ -887,7 +882,7 @@ CompilerInstance::createOutputFileImpl(StringRef OutputPath, bool Binary, "File Manager is required to fix up relative path.\n"); AbsPath.emplace(OutputPath); - FileMgr->FixupRelativePath(*AbsPath); + FileManager::fixupRelativePath(getFileSystemOpts(), *AbsPath); OutputPath = *AbsPath; } diff --git a/clang/lib/Frontend/CompilerInvocation.cpp b/clang/lib/Frontend/CompilerInvocation.cpp index 2fd283aaf8b64..051ae2b9af0e7 100644 --- a/clang/lib/Frontend/CompilerInvocation.cpp +++ b/clang/lib/Frontend/CompilerInvocation.cpp @@ -1841,10 +1841,6 @@ void CompilerInvocationBase::GenerateCodeGenArgs(const CodeGenOptions &Opts, serializeSanitizerKinds(Opts.SanitizeAnnotateDebugInfo)) GenerateArg(Consumer, OPT_fsanitize_annotate_debug_info_EQ, Sanitizer); - if (Opts.AllocTokenMax) - GenerateArg(Consumer, OPT_falloc_token_max_EQ, - std::to_string(*Opts.AllocTokenMax)); - if (!Opts.EmitVersionIdentMetadata) GenerateArg(Consumer, OPT_Qn); @@ -2358,15 +2354,6 @@ bool CompilerInvocation::ParseCodeGenArgs(CodeGenOptions &Opts, ArgList &Args, } } - if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_max_EQ)) { - StringRef S = Arg->getValue(); - uint64_t Value = 0; - if (S.getAsInteger(0, Value)) - Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S; - else - Opts.AllocTokenMax = Value; - } - Opts.EmitVersionIdentMetadata = Args.hasFlag(OPT_Qy, OPT_Qn, true); if (!LangOpts->CUDAIsDevice) @@ -3235,7 +3222,7 @@ static bool ParseFrontendArgs(FrontendOptions &Opts, ArgList &Args, DashX = llvm::StringSwitch(XValue) .Case("cpp-output", InputKind(Language::C).getPreprocessed()) 
.Case("assembler-with-cpp", Language::Asm) - .Cases("ast", "pcm", "precompiled-header", + .Cases({"ast", "pcm", "precompiled-header"}, InputKind(Language::Unknown, InputKind::Precompiled)) .Case("ir", Language::LLVM_IR) .Case("cir", Language::CIR) @@ -4037,6 +4024,29 @@ void CompilerInvocationBase::GenerateLangArgs(const LangOptions &Opts, if (!Opts.RandstructSeed.empty()) GenerateArg(Consumer, OPT_frandomize_layout_seed_EQ, Opts.RandstructSeed); + + if (Opts.AllocTokenMax) + GenerateArg(Consumer, OPT_falloc_token_max_EQ, + std::to_string(*Opts.AllocTokenMax)); + + if (Opts.AllocTokenMode) { + StringRef S; + switch (*Opts.AllocTokenMode) { + case llvm::AllocTokenMode::Increment: + S = "increment"; + break; + case llvm::AllocTokenMode::Random: + S = "random"; + break; + case llvm::AllocTokenMode::TypeHash: + S = "typehash"; + break; + case llvm::AllocTokenMode::TypeHashPointerSplit: + S = "typehashpointersplit"; + break; + } + GenerateArg(Consumer, OPT_falloc_token_mode_EQ, S); + } } bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, @@ -4673,6 +4683,23 @@ bool CompilerInvocation::ParseLangArgs(LangOptions &Opts, ArgList &Args, if (const Arg *A = Args.getLastArg(OPT_frandomize_layout_seed_EQ)) Opts.RandstructSeed = A->getValue(0); + if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_max_EQ)) { + StringRef S = Arg->getValue(); + uint64_t Value = 0; + if (S.getAsInteger(0, Value)) + Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S; + else + Opts.AllocTokenMax = Value; + } + + if (const auto *Arg = Args.getLastArg(options::OPT_falloc_token_mode_EQ)) { + StringRef S = Arg->getValue(); + if (auto Mode = getAllocTokenModeFromString(S)) + Opts.AllocTokenMode = Mode; + else + Diags.Report(diag::err_drv_invalid_value) << Arg->getAsString(Args) << S; + } + // Validate options for HLSL if (Opts.HLSL) { // TODO: Revisit restricting SPIR-V to logical once we've figured out how to diff --git 
a/clang/lib/Frontend/FrontendAction.cpp b/clang/lib/Frontend/FrontendAction.cpp index 0daa20a87dd7d..ed1169eb06d22 100644 --- a/clang/lib/Frontend/FrontendAction.cpp +++ b/clang/lib/Frontend/FrontendAction.cpp @@ -945,6 +945,7 @@ bool FrontendAction::BeginSourceFile(CompilerInstance &CI, // Set the shared objects, these are reset when we finish processing the // file, otherwise the CompilerInstance will happily destroy them. + CI.setVirtualFileSystem(AST->getVirtualFileSystemPtr()); CI.setFileManager(AST->getFileManagerPtr()); CI.setSourceManager(AST->getSourceManagerPtr()); CI.setPreprocessor(AST->getPreprocessorPtr()); diff --git a/clang/lib/Frontend/PrecompiledPreamble.cpp b/clang/lib/Frontend/PrecompiledPreamble.cpp index 03f70b74dfb42..9bf18b4d897ce 100644 --- a/clang/lib/Frontend/PrecompiledPreamble.cpp +++ b/clang/lib/Frontend/PrecompiledPreamble.cpp @@ -479,16 +479,12 @@ llvm::ErrorOr PrecompiledPreamble::Build( Diagnostics->Reset(); ProcessWarningOptions(*Diagnostics, Clang->getDiagnosticOpts(), *VFS); - VFS = createVFSFromCompilerInvocation(Clang->getInvocation(), *Diagnostics, - VFS); - // Create a file manager object to provide access to and cache the filesystem. - Clang->setFileManager( - llvm::makeIntrusiveRefCnt(Clang->getFileSystemOpts(), VFS)); + Clang->createVirtualFileSystem(VFS); + Clang->createFileManager(); // Create the source manager. 
- Clang->setSourceManager(llvm::makeIntrusiveRefCnt( - *Diagnostics, Clang->getFileManager())); + Clang->createSourceManager(); auto PreambleDepCollector = std::make_shared(); Clang->addDependencyCollector(PreambleDepCollector); diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt index 3cde8a024e9dc..4e0a8ada2a6ef 100644 --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -4,6 +4,9 @@ set(core_files builtins.h float.h + __float_float.h + __float_header_macro.h + __float_infinity_nan.h inttypes.h iso646.h limits.h diff --git a/clang/lib/Headers/__float_float.h b/clang/lib/Headers/__float_float.h new file mode 100644 index 0000000000000..267c0721a7ee4 --- /dev/null +++ b/clang/lib/Headers/__float_float.h @@ -0,0 +1,176 @@ +/*===---- __float_float.h --------------------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_FLOAT_FLOAT_H +#define __CLANG_FLOAT_FLOAT_H + +#if (defined(__MINGW32__) || defined(_MSC_VER) || defined(_AIX)) && \ + __STDC_HOSTED__ + +/* Undefine anything that we'll be redefining below. 
*/ +# undef FLT_EVAL_METHOD +# undef FLT_ROUNDS +# undef FLT_RADIX +# undef FLT_MANT_DIG +# undef DBL_MANT_DIG +# undef LDBL_MANT_DIG +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + !defined(__STRICT_ANSI__) || \ + (defined(__cplusplus) && __cplusplus >= 201103L) || \ + (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE)) +# undef DECIMAL_DIG +# endif +# undef FLT_DIG +# undef DBL_DIG +# undef LDBL_DIG +# undef FLT_MIN_EXP +# undef DBL_MIN_EXP +# undef LDBL_MIN_EXP +# undef FLT_MIN_10_EXP +# undef DBL_MIN_10_EXP +# undef LDBL_MIN_10_EXP +# undef FLT_MAX_EXP +# undef DBL_MAX_EXP +# undef LDBL_MAX_EXP +# undef FLT_MAX_10_EXP +# undef DBL_MAX_10_EXP +# undef LDBL_MAX_10_EXP +# undef FLT_MAX +# undef DBL_MAX +# undef LDBL_MAX +# undef FLT_EPSILON +# undef DBL_EPSILON +# undef LDBL_EPSILON +# undef FLT_MIN +# undef DBL_MIN +# undef LDBL_MIN +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \ + !defined(__STRICT_ANSI__) || \ + (defined(__cplusplus) && __cplusplus >= 201703L) || \ + (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE)) +# undef FLT_TRUE_MIN +# undef DBL_TRUE_MIN +# undef LDBL_TRUE_MIN +# undef FLT_DECIMAL_DIG +# undef DBL_DECIMAL_DIG +# undef LDBL_DECIMAL_DIG +# undef FLT_HAS_SUBNORM +# undef DBL_HAS_SUBNORM +# undef LDBL_HAS_SUBNORM +# endif +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ + !defined(__STRICT_ANSI__) +# undef FLT_NORM_MAX +# undef DBL_NORM_MAX +# undef LDBL_NORM_MAX +#endif +#endif + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ + !defined(__STRICT_ANSI__) +# undef FLT_SNAN +# undef DBL_SNAN +# undef LDBL_SNAN +#endif + +/* Characteristics of floating point types, C99 5.2.4.2.2 */ + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + (defined(__cplusplus) && __cplusplus >= 201103L) +#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__ +#endif +#define FLT_ROUNDS (__builtin_flt_rounds()) +#define FLT_RADIX __FLT_RADIX__ + +#define 
FLT_MANT_DIG __FLT_MANT_DIG__ +#define DBL_MANT_DIG __DBL_MANT_DIG__ +#define LDBL_MANT_DIG __LDBL_MANT_DIG__ + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ + !defined(__STRICT_ANSI__) || \ + (defined(__cplusplus) && __cplusplus >= 201103L) || \ + (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE)) +# define DECIMAL_DIG __DECIMAL_DIG__ +#endif + +#define FLT_DIG __FLT_DIG__ +#define DBL_DIG __DBL_DIG__ +#define LDBL_DIG __LDBL_DIG__ + +#define FLT_MIN_EXP __FLT_MIN_EXP__ +#define DBL_MIN_EXP __DBL_MIN_EXP__ +#define LDBL_MIN_EXP __LDBL_MIN_EXP__ + +#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__ +#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__ +#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__ + +#define FLT_MAX_EXP __FLT_MAX_EXP__ +#define DBL_MAX_EXP __DBL_MAX_EXP__ +#define LDBL_MAX_EXP __LDBL_MAX_EXP__ + +#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__ +#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__ +#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__ + +#define FLT_MAX __FLT_MAX__ +#define DBL_MAX __DBL_MAX__ +#define LDBL_MAX __LDBL_MAX__ + +#define FLT_EPSILON __FLT_EPSILON__ +#define DBL_EPSILON __DBL_EPSILON__ +#define LDBL_EPSILON __LDBL_EPSILON__ + +#define FLT_MIN __FLT_MIN__ +#define DBL_MIN __DBL_MIN__ +#define LDBL_MIN __LDBL_MIN__ + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \ + !defined(__STRICT_ANSI__) || \ + (defined(__cplusplus) && __cplusplus >= 201703L) || \ + (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE)) +# define FLT_TRUE_MIN __FLT_DENORM_MIN__ +# define DBL_TRUE_MIN __DBL_DENORM_MIN__ +# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__ +# define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__ +# define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__ +# define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ +# define FLT_HAS_SUBNORM __FLT_HAS_DENORM__ +# define DBL_HAS_SUBNORM __DBL_HAS_DENORM__ +# define LDBL_HAS_SUBNORM __LDBL_HAS_DENORM__ +#endif + +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ + !defined(__STRICT_ANSI__) + /* 
C23 5.2.5.3.2p28 */ +# define FLT_SNAN (__builtin_nansf("")) +# define DBL_SNAN (__builtin_nans("")) +# define LDBL_SNAN (__builtin_nansl("")) + + /* C23 5.2.5.3.3p32 */ +# define FLT_NORM_MAX __FLT_NORM_MAX__ +# define DBL_NORM_MAX __DBL_NORM_MAX__ +# define LDBL_NORM_MAX __LDBL_NORM_MAX__ +#endif + +#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__ +# define FLT16_MANT_DIG __FLT16_MANT_DIG__ +# define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__ +# define FLT16_DIG __FLT16_DIG__ +# define FLT16_MIN_EXP __FLT16_MIN_EXP__ +# define FLT16_MIN_10_EXP __FLT16_MIN_10_EXP__ +# define FLT16_MAX_EXP __FLT16_MAX_EXP__ +# define FLT16_MAX_10_EXP __FLT16_MAX_10_EXP__ +# define FLT16_MAX __FLT16_MAX__ +# define FLT16_EPSILON __FLT16_EPSILON__ +# define FLT16_MIN __FLT16_MIN__ +# define FLT16_TRUE_MIN __FLT16_TRUE_MIN__ +#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */ + +#endif /* __CLANG_FLOAT_FLOAT_H */ diff --git a/clang/lib/Headers/__float_header_macro.h b/clang/lib/Headers/__float_header_macro.h new file mode 100644 index 0000000000000..11b270e90deb9 --- /dev/null +++ b/clang/lib/Headers/__float_header_macro.h @@ -0,0 +1,12 @@ +/*===---- __float_header_macro.h -------------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. + * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_FLOAT_H +#define __CLANG_FLOAT_H +#endif /* __CLANG_FLOAT_H */ diff --git a/clang/lib/Headers/__float_infinity_nan.h b/clang/lib/Headers/__float_infinity_nan.h new file mode 100644 index 0000000000000..7e253d0bc59b9 --- /dev/null +++ b/clang/lib/Headers/__float_infinity_nan.h @@ -0,0 +1,20 @@ +/*===---- __float_infinity_nan.h -------------------------------------------=== + * + * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+ * See https://llvm.org/LICENSE.txt for license information. + * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + * + *===-----------------------------------------------------------------------=== + */ + +#ifndef __CLANG_FLOAT_INFINITY_NAN_H +#define __CLANG_FLOAT_INFINITY_NAN_H + +/* C23 5.2.5.3.3p29-30 */ +#undef INFINITY +#undef NAN + +#define INFINITY (__builtin_inff()) +#define NAN (__builtin_nanf("")) + +#endif /* __CLANG_FLOAT_INFINITY_NAN_H */ diff --git a/clang/lib/Headers/avx2intrin.h b/clang/lib/Headers/avx2intrin.h index fdb825fbbd134..3cbaaece7b38e 100644 --- a/clang/lib/Headers/avx2intrin.h +++ b/clang/lib/Headers/avx2intrin.h @@ -1975,10 +1975,9 @@ _mm256_shuffle_epi8(__m256i __a, __m256i __b) { /// \param __b /// A 256-bit integer vector]. /// \returns A 256-bit integer vector containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sign_epi8(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sign_epi8(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_psignb256((__v32qi)__a, (__v32qi)__b); } /// Sets each element of the result to the corresponding element of the @@ -1996,10 +1995,9 @@ _mm256_sign_epi8(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [16 x i16]. /// \returns A 256-bit vector of [16 x i16] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sign_epi16(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sign_epi16(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_psignw256((__v16hi)__a, (__v16hi)__b); } /// Sets each element of the result to the corresponding element of the @@ -2017,10 +2015,9 @@ _mm256_sign_epi16(__m256i __a, __m256i __b) /// \param __b /// A 256-bit vector of [8 x i32]. 
/// \returns A 256-bit vector of [8 x i32] containing the result. -static __inline__ __m256i __DEFAULT_FN_ATTRS256 -_mm256_sign_epi32(__m256i __a, __m256i __b) -{ - return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); +static __inline__ __m256i __DEFAULT_FN_ATTRS256_CONSTEXPR +_mm256_sign_epi32(__m256i __a, __m256i __b) { + return (__m256i)__builtin_ia32_psignd256((__v8si)__a, (__v8si)__b); } /// Shifts each 128-bit half of the 256-bit integer vector \a a left by diff --git a/clang/lib/Headers/avx512ifmavlintrin.h b/clang/lib/Headers/avx512ifmavlintrin.h index c4449c7ece9ff..b377c17166ffb 100644 --- a/clang/lib/Headers/avx512ifmavlintrin.h +++ b/clang/lib/Headers/avx512ifmavlintrin.h @@ -37,6 +37,7 @@ #endif +#if !(defined(__AVXIFMA__) || defined(__AVX512IFMA__)) #define _mm_madd52hi_epu64(X, Y, Z) \ ((__m128i)__builtin_ia32_vpmadd52huq128((__v2di)(X), (__v2di)(Y), \ (__v2di)(Z))) @@ -52,56 +53,83 @@ #define _mm256_madd52lo_epu64(X, Y, Z) \ ((__m256i)__builtin_ia32_vpmadd52luq256((__v4di)(X), (__v4di)(Y), \ (__v4di)(Z))) +#endif + +#if defined(__AVX512IFMA__) +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_madd52hi_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52huq128((__v2di)__X, (__v2di)__Y, + (__v2di)__Z); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd52hi_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52huq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); +} + +static __inline__ __m128i __DEFAULT_FN_ATTRS128 +_mm_madd52lo_epu64(__m128i __X, __m128i __Y, __m128i __Z) { + return (__m128i)__builtin_ia32_vpmadd52luq128((__v2di)__X, (__v2di)__Y, + (__v2di)__Z); +} + +static __inline__ __m256i __DEFAULT_FN_ATTRS256 +_mm256_madd52lo_epu64(__m256i __X, __m256i __Y, __m256i __Z) { + return (__m256i)__builtin_ia32_vpmadd52luq256((__v4di)__X, (__v4di)__Y, + (__v4di)__Z); +} +#endif static __inline__ __m128i __DEFAULT_FN_ATTRS128 
_mm_mask_madd52hi_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128( - __M, (__v2di)_mm_madd52hi_epu64(__W, __X, __Y), (__v2di)__W); + __M, (__v2di)__builtin_ia32_vpmadd52huq128(__W, __X, __Y), (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd52hi_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { return (__m128i)__builtin_ia32_selectq_128( - __M, (__v2di)_mm_madd52hi_epu64(__X, __Y, __Z), + __M, (__v2di)__builtin_ia32_vpmadd52huq128(__X, __Y, __Z), (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52hi_epu64( __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return (__m256i)__builtin_ia32_selectq_256( - __M, (__v4di)_mm256_madd52hi_epu64(__W, __X, __Y), (__v4di)__W); + __M, (__v4di)__builtin_ia32_vpmadd52huq256(__W, __X, __Y), (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52hi_epu64( __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_selectq_256( - __M, (__v4di)_mm256_madd52hi_epu64(__X, __Y, __Z), + __M, (__v4di)__builtin_ia32_vpmadd52huq256(__X, __Y, __Z), (__v4di)_mm256_setzero_si256()); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_mask_madd52lo_epu64(__m128i __W, __mmask8 __M, __m128i __X, __m128i __Y) { return (__m128i)__builtin_ia32_selectq_128( - __M, (__v2di)_mm_madd52lo_epu64(__W, __X, __Y), (__v2di)__W); + __M, (__v2di)__builtin_ia32_vpmadd52luq128(__W, __X, __Y), (__v2di)__W); } static __inline__ __m128i __DEFAULT_FN_ATTRS128 _mm_maskz_madd52lo_epu64(__mmask8 __M, __m128i __X, __m128i __Y, __m128i __Z) { return (__m128i)__builtin_ia32_selectq_128( - __M, (__v2di)_mm_madd52lo_epu64(__X, __Y, __Z), + __M, (__v2di)__builtin_ia32_vpmadd52luq128(__X, __Y, __Z), (__v2di)_mm_setzero_si128()); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_mask_madd52lo_epu64( __m256i __W, __mmask8 __M, __m256i __X, __m256i __Y) { return 
(__m256i)__builtin_ia32_selectq_256( - __M, (__v4di)_mm256_madd52lo_epu64(__W, __X, __Y), (__v4di)__W); + __M, (__v4di)__builtin_ia32_vpmadd52luq256(__W, __X, __Y), (__v4di)__W); } static __inline__ __m256i __DEFAULT_FN_ATTRS256 _mm256_maskz_madd52lo_epu64( __mmask8 __M, __m256i __X, __m256i __Y, __m256i __Z) { return (__m256i)__builtin_ia32_selectq_256( - __M, (__v4di)_mm256_madd52lo_epu64(__X, __Y, __Z), + __M, (__v4di)__builtin_ia32_vpmadd52luq256(__X, __Y, __Z), (__v4di)_mm256_setzero_si256()); } diff --git a/clang/lib/Headers/avxifmaintrin.h b/clang/lib/Headers/avxifmaintrin.h index a2ef601913431..e452d5f0920e9 100644 --- a/clang/lib/Headers/avxifmaintrin.h +++ b/clang/lib/Headers/avxifmaintrin.h @@ -31,6 +31,13 @@ __min_vector_width__(256))) #endif +#if !defined(__AVX512IFMA__) && defined(__AVXIFMA__) +#define _mm_madd52hi_epu64(X, Y, Z) _mm_madd52hi_avx_epu64(X, Y, Z) +#define _mm_madd52lo_epu64(X, Y, Z) _mm_madd52lo_avx_epu64(X, Y, Z) +#define _mm256_madd52hi_epu64(X, Y, Z) _mm256_madd52hi_avx_epu64(X, Y, Z) +#define _mm256_madd52lo_epu64(X, Y, Z) _mm256_madd52lo_avx_epu64(X, Y, Z) +#endif + // must vex-encoding /// Multiply packed unsigned 52-bit integers in each 64-bit element of \a __Y diff --git a/clang/lib/Headers/float.h b/clang/lib/Headers/float.h index 30427c29dcdbd..82974f60048c9 100644 --- a/clang/lib/Headers/float.h +++ b/clang/lib/Headers/float.h @@ -7,13 +7,21 @@ *===-----------------------------------------------------------------------=== */ -#ifndef __CLANG_FLOAT_H -#define __CLANG_FLOAT_H - #if defined(__MVS__) && __has_include_next() +#include <__float_header_macro.h> #include_next #else +#if !defined(__need_infinity_nan) +#define __need_float_float +#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ + !defined(__STRICT_ANSI__) +#define __need_infinity_nan +#endif +#include <__float_header_macro.h> +#endif + +#ifdef __need_float_float /* If we're on MinGW, fall back to the system's float.h, which might have * additional 
definitions provided for Windows. * For more details see http://msdn.microsoft.com/en-us/library/y0ybw9fy.aspx @@ -26,171 +34,15 @@ # include_next -/* Undefine anything that we'll be redefining below. */ -# undef FLT_EVAL_METHOD -# undef FLT_ROUNDS -# undef FLT_RADIX -# undef FLT_MANT_DIG -# undef DBL_MANT_DIG -# undef LDBL_MANT_DIG -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ - !defined(__STRICT_ANSI__) || \ - (defined(__cplusplus) && __cplusplus >= 201103L) || \ - (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE)) -# undef DECIMAL_DIG -# endif -# undef FLT_DIG -# undef DBL_DIG -# undef LDBL_DIG -# undef FLT_MIN_EXP -# undef DBL_MIN_EXP -# undef LDBL_MIN_EXP -# undef FLT_MIN_10_EXP -# undef DBL_MIN_10_EXP -# undef LDBL_MIN_10_EXP -# undef FLT_MAX_EXP -# undef DBL_MAX_EXP -# undef LDBL_MAX_EXP -# undef FLT_MAX_10_EXP -# undef DBL_MAX_10_EXP -# undef LDBL_MAX_10_EXP -# undef FLT_MAX -# undef DBL_MAX -# undef LDBL_MAX -# undef FLT_EPSILON -# undef DBL_EPSILON -# undef LDBL_EPSILON -# undef FLT_MIN -# undef DBL_MIN -# undef LDBL_MIN -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \ - !defined(__STRICT_ANSI__) || \ - (defined(__cplusplus) && __cplusplus >= 201703L) || \ - (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE)) -# undef FLT_TRUE_MIN -# undef DBL_TRUE_MIN -# undef LDBL_TRUE_MIN -# undef FLT_DECIMAL_DIG -# undef DBL_DECIMAL_DIG -# undef LDBL_DECIMAL_DIG -# undef FLT_HAS_SUBNORM -# undef DBL_HAS_SUBNORM -# undef LDBL_HAS_SUBNORM -# endif -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ - !defined(__STRICT_ANSI__) -# undef FLT_NORM_MAX -# undef DBL_NORM_MAX -# undef LDBL_NORM_MAX -#endif -#endif - -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ - !defined(__STRICT_ANSI__) -# undef INFINITY -# undef NAN -# undef FLT_SNAN -# undef DBL_SNAN -# undef LDBL_SNAN -#endif - -/* Characteristics of floating point types, C99 5.2.4.2.2 */ - -#if (defined(__STDC_VERSION__) && 
__STDC_VERSION__ >= 199901L) || \ - (defined(__cplusplus) && __cplusplus >= 201103L) -#define FLT_EVAL_METHOD __FLT_EVAL_METHOD__ #endif -#define FLT_ROUNDS (__builtin_flt_rounds()) -#define FLT_RADIX __FLT_RADIX__ -#define FLT_MANT_DIG __FLT_MANT_DIG__ -#define DBL_MANT_DIG __DBL_MANT_DIG__ -#define LDBL_MANT_DIG __LDBL_MANT_DIG__ - -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L) || \ - !defined(__STRICT_ANSI__) || \ - (defined(__cplusplus) && __cplusplus >= 201103L) || \ - (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE)) -# define DECIMAL_DIG __DECIMAL_DIG__ -#endif - -#define FLT_DIG __FLT_DIG__ -#define DBL_DIG __DBL_DIG__ -#define LDBL_DIG __LDBL_DIG__ - -#define FLT_MIN_EXP __FLT_MIN_EXP__ -#define DBL_MIN_EXP __DBL_MIN_EXP__ -#define LDBL_MIN_EXP __LDBL_MIN_EXP__ - -#define FLT_MIN_10_EXP __FLT_MIN_10_EXP__ -#define DBL_MIN_10_EXP __DBL_MIN_10_EXP__ -#define LDBL_MIN_10_EXP __LDBL_MIN_10_EXP__ - -#define FLT_MAX_EXP __FLT_MAX_EXP__ -#define DBL_MAX_EXP __DBL_MAX_EXP__ -#define LDBL_MAX_EXP __LDBL_MAX_EXP__ - -#define FLT_MAX_10_EXP __FLT_MAX_10_EXP__ -#define DBL_MAX_10_EXP __DBL_MAX_10_EXP__ -#define LDBL_MAX_10_EXP __LDBL_MAX_10_EXP__ - -#define FLT_MAX __FLT_MAX__ -#define DBL_MAX __DBL_MAX__ -#define LDBL_MAX __LDBL_MAX__ - -#define FLT_EPSILON __FLT_EPSILON__ -#define DBL_EPSILON __DBL_EPSILON__ -#define LDBL_EPSILON __LDBL_EPSILON__ - -#define FLT_MIN __FLT_MIN__ -#define DBL_MIN __DBL_MIN__ -#define LDBL_MIN __LDBL_MIN__ - -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 201112L) || \ - !defined(__STRICT_ANSI__) || \ - (defined(__cplusplus) && __cplusplus >= 201703L) || \ - (__STDC_HOSTED__ && defined(_AIX) && defined(_ALL_SOURCE)) -# define FLT_TRUE_MIN __FLT_DENORM_MIN__ -# define DBL_TRUE_MIN __DBL_DENORM_MIN__ -# define LDBL_TRUE_MIN __LDBL_DENORM_MIN__ -# define FLT_DECIMAL_DIG __FLT_DECIMAL_DIG__ -# define DBL_DECIMAL_DIG __DBL_DECIMAL_DIG__ -# define LDBL_DECIMAL_DIG __LDBL_DECIMAL_DIG__ -# define FLT_HAS_SUBNORM 
__FLT_HAS_DENORM__ -# define DBL_HAS_SUBNORM __DBL_HAS_DENORM__ -# define LDBL_HAS_SUBNORM __LDBL_HAS_DENORM__ +#include <__float_float.h> +#undef __need_float_float #endif -#if (defined(__STDC_VERSION__) && __STDC_VERSION__ >= 202311L) || \ - !defined(__STRICT_ANSI__) - /* C23 5.2.5.3.2p28 */ -# define FLT_SNAN (__builtin_nansf("")) -# define DBL_SNAN (__builtin_nans("")) -# define LDBL_SNAN (__builtin_nansl("")) - - /* C23 5.2.5.3.3p29-30 */ -# define INFINITY (__builtin_inff()) -# define NAN (__builtin_nanf("")) - - /* C23 5.2.5.3.3p32 */ -# define FLT_NORM_MAX __FLT_NORM_MAX__ -# define DBL_NORM_MAX __DBL_NORM_MAX__ -# define LDBL_NORM_MAX __LDBL_NORM_MAX__ +#ifdef __need_infinity_nan +#include <__float_infinity_nan.h> +#undef __need_infinity_nan #endif -#ifdef __STDC_WANT_IEC_60559_TYPES_EXT__ -# define FLT16_MANT_DIG __FLT16_MANT_DIG__ -# define FLT16_DECIMAL_DIG __FLT16_DECIMAL_DIG__ -# define FLT16_DIG __FLT16_DIG__ -# define FLT16_MIN_EXP __FLT16_MIN_EXP__ -# define FLT16_MIN_10_EXP __FLT16_MIN_10_EXP__ -# define FLT16_MAX_EXP __FLT16_MAX_EXP__ -# define FLT16_MAX_10_EXP __FLT16_MAX_10_EXP__ -# define FLT16_MAX __FLT16_MAX__ -# define FLT16_EPSILON __FLT16_EPSILON__ -# define FLT16_MIN __FLT16_MIN__ -# define FLT16_TRUE_MIN __FLT16_TRUE_MIN__ -#endif /* __STDC_WANT_IEC_60559_TYPES_EXT__ */ - #endif /* __MVS__ */ -#endif /* __CLANG_FLOAT_H */ diff --git a/clang/lib/Headers/module.modulemap b/clang/lib/Headers/module.modulemap index ae9d5c0a1f75e..a330c3ae38527 100644 --- a/clang/lib/Headers/module.modulemap +++ b/clang/lib/Headers/module.modulemap @@ -171,8 +171,22 @@ module _Builtin_intrinsics [system] [extern_c] { // that module. The system float.h (if present) will be treated // as a textual header in the sytem module. 
module _Builtin_float [system] { - header "float.h" - export * + textual header "float.h" + + explicit module float { + header "__float_float.h" + export * + } + + explicit module header_macro { + header "__float_header_macro.h" + export * + } + + explicit module infinity_nan { + header "__float_infinity_nan.h" + export * + } } module _Builtin_inttypes [system] { diff --git a/clang/lib/Headers/sifive_vector.h b/clang/lib/Headers/sifive_vector.h index 4e67ad6fca262..ae01627de2d7d 100644 --- a/clang/lib/Headers/sifive_vector.h +++ b/clang/lib/Headers/sifive_vector.h @@ -115,4 +115,60 @@ #endif #endif +#define __riscv_sf_vsettnt_e8w1(atn) __riscv_sf_vsettnt(atn, 0, 1); +#define __riscv_sf_vsettnt_e8w2(atn) __riscv_sf_vsettnt(atn, 0, 2); +#define __riscv_sf_vsettnt_e8w4(atn) __riscv_sf_vsettnt(atn, 0, 3); +#define __riscv_sf_vsettnt_e16w1(atn) __riscv_sf_vsettnt(atn, 1, 1); +#define __riscv_sf_vsettnt_e16w2(atn) __riscv_sf_vsettnt(atn, 1, 2); +#define __riscv_sf_vsettnt_e16w4(atn) __riscv_sf_vsettnt(atn, 1, 3); +#define __riscv_sf_vsettnt_e32w1(atn) __riscv_sf_vsettnt(atn, 2, 1); +#define __riscv_sf_vsettnt_e32w2(atn) __riscv_sf_vsettnt(atn, 2, 2); +#define __riscv_sf_vsettm_e8w1(atm) __riscv_sf_vsettm(atm, 0, 1); +#define __riscv_sf_vsettm_e8w2(atm) __riscv_sf_vsettm(atm, 0, 2); +#define __riscv_sf_vsettm_e8w4(atm) __riscv_sf_vsettm(atm, 0, 3); +#define __riscv_sf_vsettm_e16w1(atm) __riscv_sf_vsettm(atm, 1, 1); +#define __riscv_sf_vsettm_e16w2(atm) __riscv_sf_vsettm(atm, 1, 2); +#define __riscv_sf_vsettm_e16w4(atm) __riscv_sf_vsettm(atm, 1, 3); +#define __riscv_sf_vsettm_e32w1(atm) __riscv_sf_vsettm(atm, 2, 1); +#define __riscv_sf_vsettm_e32w2(atm) __riscv_sf_vsettm(atm, 2, 2); +#define __riscv_sf_vsettn_e8w1(atn) __riscv_sf_vsettn(atn, 0, 1); +#define __riscv_sf_vsettn_e8w2(atn) __riscv_sf_vsettn(atn, 0, 2); +#define __riscv_sf_vsettn_e8w4(atn) __riscv_sf_vsettn(atn, 0, 3); +#define __riscv_sf_vsettn_e16w1(atn) __riscv_sf_vsettn(atn, 1, 1); +#define 
__riscv_sf_vsettn_e16w2(atn) __riscv_sf_vsettn(atn, 1, 2); +#define __riscv_sf_vsettn_e16w4(atn) __riscv_sf_vsettn(atn, 1, 3); +#define __riscv_sf_vsettn_e32w1(atn) __riscv_sf_vsettn(atn, 2, 1); +#define __riscv_sf_vsettn_e32w2(atn) __riscv_sf_vsettn(atn, 2, 2); +#define __riscv_sf_vsettk_e8w1(atk) __riscv_sf_vsettk(atk, 0, 1); +#define __riscv_sf_vsettk_e8w2(atk) __riscv_sf_vsettk(atk, 0, 2); +#define __riscv_sf_vsettk_e8w4(atk) __riscv_sf_vsettk(atk, 0, 3); +#define __riscv_sf_vsettk_e16w1(atk) __riscv_sf_vsettk(atk, 1, 1); +#define __riscv_sf_vsettk_e16w2(atk) __riscv_sf_vsettk(atk, 1, 2); +#define __riscv_sf_vsettk_e16w4(atk) __riscv_sf_vsettk(atk, 1, 3); +#define __riscv_sf_vsettk_e32w1(atk) __riscv_sf_vsettk(atk, 2, 1); +#define __riscv_sf_vsettk_e32w2(atk) __riscv_sf_vsettk(atk, 2, 2); +#define __riscv_sf_vtzero_t_e8w1(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 3, 1); +#define __riscv_sf_vtzero_t_e8w2(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 3, 2); +#define __riscv_sf_vtzero_t_e8w4(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 3, 4); +#define __riscv_sf_vtzero_t_e16w1(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 4, 1); +#define __riscv_sf_vtzero_t_e16w2(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 4, 2); +#define __riscv_sf_vtzero_t_e16w4(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 4, 4); +#define __riscv_sf_vtzero_t_e32w1(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 5, 1); +#define __riscv_sf_vtzero_t_e32w2(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 5, 2); +#if __riscv_v_elen >= 64 +#define __riscv_sf_vsettnt_e64w1(atn) __riscv_sf_vsettnt(atn, 3, 1); +#define __riscv_sf_vsettm_e64w1(atm) __riscv_sf_vsettm(atm, 3, 1); +#define __riscv_sf_vsettn_e64w1(atn) __riscv_sf_vsettn(atn, 3, 1); +#define __riscv_sf_vsettk_e64w1(atk) __riscv_sf_vsettk(atk, 3, 1); +#define __riscv_sf_vtzero_t_e64w1(tile, atm, atn) \ + __riscv_sf_vtzero_t(tile, atm, atn, 6, 1); +#endif #endif 
//_SIFIVE_VECTOR_H_ diff --git a/clang/lib/Headers/tmmintrin.h b/clang/lib/Headers/tmmintrin.h index 5d0f20f4d527d..cb4b36ea7383c 100644 --- a/clang/lib/Headers/tmmintrin.h +++ b/clang/lib/Headers/tmmintrin.h @@ -26,9 +26,6 @@ #define __zext128(x) \ (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ 1, 2, 3) -#define __anyext128(x) \ - (__m128i) __builtin_shufflevector((__v2si)(x), __extension__(__v2si){}, 0, \ - 1, -1, -1) #if defined(__cplusplus) && (__cplusplus >= 201103L) #define __DEFAULT_FN_ATTRS_CONSTEXPR __DEFAULT_FN_ATTRS constexpr @@ -641,10 +638,9 @@ _mm_shuffle_pi8(__m64 __a, __m64 __b) { /// A 128-bit integer vector containing control bytes corresponding to /// positions in the destination. /// \returns A 128-bit integer vector containing the resultant values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sign_epi8(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sign_epi8(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_psignb128((__v16qi)__a, (__v16qi)__b); } /// For each 16-bit integer in the first source operand, perform one of @@ -667,10 +663,9 @@ _mm_sign_epi8(__m128i __a, __m128i __b) /// A 128-bit integer vector containing control words corresponding to /// positions in the destination. /// \returns A 128-bit integer vector containing the resultant values. 
-static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sign_epi16(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sign_epi16(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_psignw128((__v8hi)__a, (__v8hi)__b); } /// For each 32-bit integer in the first source operand, perform one of @@ -693,10 +688,9 @@ _mm_sign_epi16(__m128i __a, __m128i __b) /// A 128-bit integer vector containing control doublewords corresponding to /// positions in the destination. /// \returns A 128-bit integer vector containing the resultant values. -static __inline__ __m128i __DEFAULT_FN_ATTRS -_mm_sign_epi32(__m128i __a, __m128i __b) -{ - return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b); +static __inline__ __m128i __DEFAULT_FN_ATTRS_CONSTEXPR +_mm_sign_epi32(__m128i __a, __m128i __b) { + return (__m128i)__builtin_ia32_psignd128((__v4si)__a, (__v4si)__b); } /// For each 8-bit integer in the first source operand, perform one of @@ -719,11 +713,10 @@ _mm_sign_epi32(__m128i __a, __m128i __b) /// A 64-bit integer vector containing control bytes corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sign_pi8(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_psignb128((__v16qi)__anyext128(__a), - (__v16qi)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sign_pi8(__m64 __a, + __m64 __b) { + return __trunc64(__builtin_ia32_psignb128((__v16qi)__zext128(__a), + (__v16qi)__zext128(__b))); } /// For each 16-bit integer in the first source operand, perform one of @@ -746,11 +739,10 @@ _mm_sign_pi8(__m64 __a, __m64 __b) /// A 64-bit integer vector containing control words corresponding to /// positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. 
-static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sign_pi16(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_psignw128((__v8hi)__anyext128(__a), - (__v8hi)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sign_pi16(__m64 __a, + __m64 __b) { + return __trunc64( + __builtin_ia32_psignw128((__v8hi)__zext128(__a), (__v8hi)__zext128(__b))); } /// For each 32-bit integer in the first source operand, perform one of @@ -773,14 +765,12 @@ _mm_sign_pi16(__m64 __a, __m64 __b) /// A 64-bit integer vector containing two control doublewords corresponding /// to positions in the destination. /// \returns A 64-bit integer vector containing the resultant values. -static __inline__ __m64 __DEFAULT_FN_ATTRS -_mm_sign_pi32(__m64 __a, __m64 __b) -{ - return __trunc64(__builtin_ia32_psignd128((__v4si)__anyext128(__a), - (__v4si)__anyext128(__b))); +static __inline__ __m64 __DEFAULT_FN_ATTRS_CONSTEXPR _mm_sign_pi32(__m64 __a, + __m64 __b) { + return __trunc64( + __builtin_ia32_psignd128((__v4si)__zext128(__a), (__v4si)__zext128(__b))); } -#undef __anyext128 #undef __zext128 #undef __trunc64 #undef __DEFAULT_FN_ATTRS diff --git a/clang/lib/Interpreter/Interpreter.cpp b/clang/lib/Interpreter/Interpreter.cpp index b05cb5a0f1dbe..cde354c9cd8d1 100644 --- a/clang/lib/Interpreter/Interpreter.cpp +++ b/clang/lib/Interpreter/Interpreter.cpp @@ -581,12 +581,7 @@ Interpreter::Parse(llvm::StringRef Code) { if (!TuOrErr) return TuOrErr.takeError(); - PTUs.emplace_back(PartialTranslationUnit()); - PartialTranslationUnit &LastPTU = PTUs.back(); - LastPTU.TUPart = *TuOrErr; - - if (std::unique_ptr M = Act->GenModule()) - LastPTU.TheModule = std::move(M); + PartialTranslationUnit &LastPTU = IncrParser->RegisterPTU(*TuOrErr); return LastPTU; } diff --git a/clang/lib/Lex/HeaderSearch.cpp b/clang/lib/Lex/HeaderSearch.cpp index 238c5e2f2d9a5..65c324c10ca5d 100644 --- a/clang/lib/Lex/HeaderSearch.cpp +++ b/clang/lib/Lex/HeaderSearch.cpp @@ -2186,6 +2186,8 @@ std::string 
HeaderSearch::suggestPathToFileForDiagnostics( void clang::normalizeModuleCachePath(FileManager &FileMgr, StringRef Path, SmallVectorImpl &NormalizedPath) { NormalizedPath.assign(Path.begin(), Path.end()); - FileMgr.makeAbsolutePath(NormalizedPath); - llvm::sys::path::remove_dots(NormalizedPath); + if (!NormalizedPath.empty()) { + FileMgr.makeAbsolutePath(NormalizedPath); + llvm::sys::path::remove_dots(NormalizedPath); + } } diff --git a/clang/lib/Lex/LiteralSupport.cpp b/clang/lib/Lex/LiteralSupport.cpp index 5b08d7f0efe5a..1c061528e85f1 100644 --- a/clang/lib/Lex/LiteralSupport.cpp +++ b/clang/lib/Lex/LiteralSupport.cpp @@ -1283,10 +1283,10 @@ bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts, // Per tweaked N3660, "il", "i", and "if" are also used in the library. // In C++2a "d" and "y" are used in the library. return llvm::StringSwitch(Suffix) - .Cases("h", "min", "s", true) - .Cases("ms", "us", "ns", true) - .Cases("il", "i", "if", true) - .Cases("d", "y", LangOpts.CPlusPlus20) + .Cases({"h", "min", "s"}, true) + .Cases({"ms", "us", "ns"}, true) + .Cases({"il", "i", "if"}, true) + .Cases({"d", "y"}, LangOpts.CPlusPlus20) .Default(false); } diff --git a/clang/lib/Parse/ParseHLSL.cpp b/clang/lib/Parse/ParseHLSL.cpp index 51f2aef869649..c727ee3a1f1a6 100644 --- a/clang/lib/Parse/ParseHLSL.cpp +++ b/clang/lib/Parse/ParseHLSL.cpp @@ -126,15 +126,9 @@ Parser::ParsedSemantic Parser::ParseHLSLSemantic() { // semantic index. The semantic index is the number at the end of // the semantic, including leading zeroes. Digits located before // the last letter are part of the semantic name. - bool Invalid = false; SmallString<256> Buffer; Buffer.resize(Tok.getLength() + 1); StringRef Identifier = PP.getSpelling(Tok, Buffer); - if (Invalid) { - Diag(Tok.getLocation(), diag::err_expected_semantic_identifier); - return {}; - } - assert(Identifier.size() > 0); // Determine the start of the semantic index. 
unsigned IndexIndex = Identifier.find_last_not_of("0123456789") + 1; diff --git a/clang/lib/Parse/Parser.cpp b/clang/lib/Parse/Parser.cpp index ec01faf446e8d..a6fc676f23a51 100644 --- a/clang/lib/Parse/Parser.cpp +++ b/clang/lib/Parse/Parser.cpp @@ -708,7 +708,7 @@ bool Parser::ParseTopLevelDecl(DeclGroupPtrTy &Result, } // Late template parsing can begin. - Actions.SetLateTemplateParser(LateTemplateParserCallback, nullptr, this); + Actions.SetLateTemplateParser(LateTemplateParserCallback, this); Actions.ActOnEndOfTranslationUnit(); //else don't tell Sema that we ended parsing: more input might come. return true; diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index e797400397d1b..f9665b5e59831 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -155,6 +155,7 @@ getEntityLifetime(const InitializedEntity *Entity, case InitializedEntity::EK_LambdaToBlockConversionBlockElement: case InitializedEntity::EK_LambdaCapture: case InitializedEntity::EK_VectorElement: + case InitializedEntity::EK_MatrixElement: case InitializedEntity::EK_ComplexElement: return {nullptr, LK_FullExpression}; diff --git a/clang/lib/Sema/HeuristicResolver.cpp b/clang/lib/Sema/HeuristicResolver.cpp index cbdefaa57aacb..056e13308b7d3 100644 --- a/clang/lib/Sema/HeuristicResolver.cpp +++ b/clang/lib/Sema/HeuristicResolver.cpp @@ -450,7 +450,12 @@ QualType HeuristicResolverImpl::resolveExprToType(const Expr *E) { if (const auto *CE = dyn_cast(E)) { if (QualType Resolved = resolveTypeOfCallExpr(CE); !Resolved.isNull()) return Resolved; + + // Don't proceed to try resolveExprToDecls(), it would just call + // resolveTypeOfCallExpr() again. + return E->getType(); } + // Similarly, unwrapping a unary dereference operation does not work via // resolveExprToDecls. 
if (const auto *UO = dyn_cast(E->IgnoreParenCasts())) { diff --git a/clang/lib/Sema/Sema.cpp b/clang/lib/Sema/Sema.cpp index 00ad27fbbedf6..c0be3e71c0583 100644 --- a/clang/lib/Sema/Sema.cpp +++ b/clang/lib/Sema/Sema.cpp @@ -276,10 +276,9 @@ Sema::Sema(Preprocessor &pp, ASTContext &ctxt, ASTConsumer &consumer, Context(ctxt), Consumer(consumer), Diags(PP.getDiagnostics()), SourceMgr(PP.getSourceManager()), APINotes(SourceMgr, LangOpts), AnalysisWarnings(*this), ThreadSafetyDeclCache(nullptr), - LateTemplateParser(nullptr), LateTemplateParserCleanup(nullptr), - OpaqueParser(nullptr), CurContext(nullptr), ExternalSource(nullptr), - StackHandler(Diags), CurScope(nullptr), Ident_super(nullptr), - AMDGPUPtr(std::make_unique(*this)), + LateTemplateParser(nullptr), OpaqueParser(nullptr), CurContext(nullptr), + ExternalSource(nullptr), StackHandler(Diags), CurScope(nullptr), + Ident_super(nullptr), AMDGPUPtr(std::make_unique(*this)), ARMPtr(std::make_unique(*this)), AVRPtr(std::make_unique(*this)), BPFPtr(std::make_unique(*this)), @@ -1253,9 +1252,6 @@ void Sema::ActOnEndOfTranslationUnit() { ? TUFragmentKind::Private : TUFragmentKind::Normal); - if (LateTemplateParserCleanup) - LateTemplateParserCleanup(OpaqueParser); - CheckDelayedMemberExceptionSpecs(); } else { // If we are building a TU prefix for serialization, it is safe to transfer @@ -1489,6 +1485,13 @@ void Sema::ActOnEndOfTranslationUnit() { Consumer.CompleteTentativeDefinition(VD); } + // In incremental mode, tentative definitions belong to the current + // partial translation unit (PTU). Once they have been completed and + // emitted to codegen, drop them to prevent re-emission in future PTUs. 
+ if (PP.isIncrementalProcessingEnabled()) + TentativeDefinitions.erase(TentativeDefinitions.begin(ExternalSource.get()), + TentativeDefinitions.end()); + for (auto *D : ExternalDeclarations) { if (!D || D->isInvalidDecl() || D->getPreviousDecl() || !D->isUsed()) continue; diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp index e007d9caac244..380a852207c21 100644 --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -1498,6 +1498,24 @@ static void builtinAllocaAddrSpace(Sema &S, CallExpr *TheCall) { TheCall->setType(S.Context.getPointerType(RT)); } +static bool checkBuiltinInferAllocToken(Sema &S, CallExpr *TheCall) { + if (S.checkArgCountAtLeast(TheCall, 1)) + return true; + + for (Expr *Arg : TheCall->arguments()) { + // If argument is dependent on a template parameter, we can't resolve now. + if (Arg->isTypeDependent() || Arg->isValueDependent()) + continue; + // Reject void types. + QualType ArgTy = Arg->IgnoreParenImpCasts()->getType(); + if (ArgTy->isVoidType()) + return S.Diag(Arg->getBeginLoc(), diag::err_param_with_void_type); + } + + TheCall->setType(S.Context.UnsignedLongLongTy); + return false; +} + namespace { enum PointerAuthOpKind { PAO_Strip, @@ -2779,6 +2797,10 @@ Sema::CheckBuiltinFunctionCall(FunctionDecl *FDecl, unsigned BuiltinID, builtinAllocaAddrSpace(*this, TheCall); } break; + case Builtin::BI__builtin_infer_alloc_token: + if (checkBuiltinInferAllocToken(*this, TheCall)) + return ExprError(); + break; case Builtin::BI__arithmetic_fence: if (BuiltinArithmeticFence(TheCall)) return ExprError(); @@ -12353,14 +12375,9 @@ static void DiagnoseMixedUnicodeImplicitConversion(Sema &S, const Type *Source, } } -enum CFIUncheckedCalleeChange { - None, - Adding, - Discarding, -}; - -static CFIUncheckedCalleeChange AdjustingCFIUncheckedCallee(QualType From, - QualType To) { +bool Sema::DiscardingCFIUncheckedCallee(QualType From, QualType To) const { + From = Context.getCanonicalType(From); + To = 
Context.getCanonicalType(To); QualType MaybePointee = From->getPointeeType(); if (!MaybePointee.isNull() && MaybePointee->getAs()) From = MaybePointee; @@ -12372,25 +12389,10 @@ static CFIUncheckedCalleeChange AdjustingCFIUncheckedCallee(QualType From, if (const auto *ToFn = To->getAs()) { if (FromFn->getCFIUncheckedCalleeAttr() && !ToFn->getCFIUncheckedCalleeAttr()) - return Discarding; - if (!FromFn->getCFIUncheckedCalleeAttr() && - ToFn->getCFIUncheckedCalleeAttr()) - return Adding; + return true; } } - return None; -} - -bool Sema::DiscardingCFIUncheckedCallee(QualType From, QualType To) const { - From = Context.getCanonicalType(From); - To = Context.getCanonicalType(To); - return ::AdjustingCFIUncheckedCallee(From, To) == Discarding; -} - -bool Sema::AddingCFIUncheckedCallee(QualType From, QualType To) const { - From = Context.getCanonicalType(From); - To = Context.getCanonicalType(To); - return ::AdjustingCFIUncheckedCallee(From, To) == Adding; + return false; } void Sema::CheckImplicitConversion(Expr *E, QualType T, SourceLocation CC, diff --git a/clang/lib/Sema/SemaConcept.cpp b/clang/lib/Sema/SemaConcept.cpp index 54cbfe46a6528..f04cc454cdb7c 100644 --- a/clang/lib/Sema/SemaConcept.cpp +++ b/clang/lib/Sema/SemaConcept.cpp @@ -385,6 +385,28 @@ class HashParameterMapping : public RecursiveASTVisitor { return inherited::TraverseStmt(E->getReplacement()); } + bool TraverseTemplateName(TemplateName Template) { + if (auto *TTP = dyn_cast_if_present( + Template.getAsTemplateDecl()); + TTP && TTP->getDepth() < TemplateArgs.getNumLevels()) { + if (!TemplateArgs.hasTemplateArgument(TTP->getDepth(), + TTP->getPosition())) + return true; + + TemplateArgument Arg = TemplateArgs(TTP->getDepth(), TTP->getPosition()); + if (TTP->isParameterPack() && SemaRef.ArgPackSubstIndex) { + assert(Arg.getKind() == TemplateArgument::Pack && + "Missing argument pack"); + Arg = SemaRef.getPackSubstitutedTemplateArgument(Arg); + } + assert(!Arg.getAsTemplate().isNull() && + "Null 
template template argument"); + UsedTemplateArgs.push_back( + SemaRef.Context.getCanonicalTemplateArgument(Arg)); + } + return inherited::TraverseTemplateName(Template); + } + void VisitConstraint(const NormalizedConstraintWithParamMapping &Constraint) { if (!Constraint.hasParameterMapping()) { for (const auto &List : TemplateArgs) @@ -417,8 +439,8 @@ class ConstraintSatisfactionChecker { const NamedDecl *Template; SourceLocation TemplateNameLoc; UnsignedOrNone PackSubstitutionIndex; - ConstraintSatisfaction &Satisfaction; + bool BuildExpression; private: ExprResult @@ -461,10 +483,11 @@ class ConstraintSatisfactionChecker { ConstraintSatisfactionChecker(Sema &SemaRef, const NamedDecl *Template, SourceLocation TemplateNameLoc, UnsignedOrNone PackSubstitutionIndex, - ConstraintSatisfaction &Satisfaction) + ConstraintSatisfaction &Satisfaction, + bool BuildExpression) : S(SemaRef), Template(Template), TemplateNameLoc(TemplateNameLoc), PackSubstitutionIndex(PackSubstitutionIndex), - Satisfaction(Satisfaction) {} + Satisfaction(Satisfaction), BuildExpression(BuildExpression) {} ExprResult Evaluate(const NormalizedConstraint &Constraint, const MultiLevelTemplateArgumentList &MLTAL); @@ -821,9 +844,10 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow( Satisfaction.ContainsErrors = false; ExprResult Expr = ConstraintSatisfactionChecker(S, Template, TemplateNameLoc, - UnsignedOrNone(I), Satisfaction) + UnsignedOrNone(I), Satisfaction, + /*BuildExpression=*/false) .Evaluate(Constraint.getNormalizedPattern(), *SubstitutedArgs); - if (Expr.isUsable()) { + if (BuildExpression && Expr.isUsable()) { if (Out.isUnset()) Out = Expr; else @@ -834,7 +858,7 @@ ExprResult ConstraintSatisfactionChecker::EvaluateSlow( Constraint.getBeginLoc(), FPOptionsOverride{}); } else { - assert(!Satisfaction.IsSatisfied); + assert(!BuildExpression || !Satisfaction.IsSatisfied); } if (!Conjunction && Satisfaction.IsSatisfied) { Satisfaction.Details.erase(Satisfaction.Details.begin() + @@ 
-985,7 +1009,7 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( ExprResult E = Evaluate(Constraint.getNormalizedConstraint(), MLTAL); - if (!E.isUsable()) { + if (E.isInvalid()) { Satisfaction.Details.insert(Satisfaction.Details.begin() + Size, ConceptId); return E; } @@ -1041,7 +1065,7 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( if (Conjunction && (!Satisfaction.IsSatisfied || Satisfaction.ContainsErrors)) return LHS; - if (!Conjunction && LHS.isUsable() && Satisfaction.IsSatisfied && + if (!Conjunction && !LHS.isInvalid() && Satisfaction.IsSatisfied && !Satisfaction.ContainsErrors) return LHS; @@ -1050,12 +1074,15 @@ ExprResult ConstraintSatisfactionChecker::Evaluate( ExprResult RHS = Evaluate(Constraint.getRHS(), MLTAL); - if (RHS.isUsable() && Satisfaction.IsSatisfied && + if (!Conjunction && !RHS.isInvalid() && Satisfaction.IsSatisfied && !Satisfaction.ContainsErrors) Satisfaction.Details.erase(Satisfaction.Details.begin() + EffectiveDetailEndIndex, Satisfaction.Details.end()); + if (!BuildExpression) + return Satisfaction.ContainsErrors ? 
ExprError() : ExprEmpty(); + if (!LHS.isUsable()) return RHS; @@ -1136,10 +1163,11 @@ static bool CheckConstraintSatisfaction( Template, /*CSE=*/nullptr, S.ArgPackSubstIndex); - ExprResult Res = - ConstraintSatisfactionChecker(S, Template, TemplateIDRange.getBegin(), - S.ArgPackSubstIndex, Satisfaction) - .Evaluate(*C, TemplateArgsLists); + ExprResult Res = ConstraintSatisfactionChecker( + S, Template, TemplateIDRange.getBegin(), + S.ArgPackSubstIndex, Satisfaction, + /*BuildExpression=*/ConvertedExpr != nullptr) + .Evaluate(*C, TemplateArgsLists); if (Res.isInvalid()) return true; @@ -2672,8 +2700,9 @@ FormulaType SubsumptionChecker::Normalize(const NormalizedConstraint &NC) { }); if (Compound.getCompoundKind() == FormulaType::Kind) { + unsigned SizeLeft = Left.size(); Res = std::move(Left); - Res.reserve(Left.size() + Right.size()); + Res.reserve(SizeLeft + Right.size()); std::for_each(std::make_move_iterator(Right.begin()), std::make_move_iterator(Right.end()), Add); return Res; diff --git a/clang/lib/Sema/SemaDeclAttr.cpp b/clang/lib/Sema/SemaDeclAttr.cpp index 9475b8a684082..964a2a791e18f 100644 --- a/clang/lib/Sema/SemaDeclAttr.cpp +++ b/clang/lib/Sema/SemaDeclAttr.cpp @@ -5206,16 +5206,36 @@ static void handleCallConvAttr(Sema &S, Decl *D, const ParsedAttr &AL) { static void handleDeviceKernelAttr(Sema &S, Decl *D, const ParsedAttr &AL) { const auto *FD = dyn_cast_or_null(D); bool IsFunctionTemplate = FD && FD->getDescribedFunctionTemplate(); - if (S.getASTContext().getTargetInfo().getTriple().isNVPTX()) { + llvm::Triple Triple = S.getASTContext().getTargetInfo().getTriple(); + const LangOptions &LangOpts = S.getLangOpts(); + // OpenCL has its own error messages. + if (!LangOpts.OpenCL && FD && !FD->isExternallyVisible()) { + S.Diag(AL.getLoc(), diag::err_hidden_device_kernel) << FD; + AL.setInvalid(); + return; + } + if (Triple.isNVPTX()) { handleGlobalAttr(S, D, AL); } else { // OpenCL C++ will throw a more specific error. 
- if (!S.getLangOpts().OpenCLCPlusPlus && (!FD || IsFunctionTemplate)) { + if (!LangOpts.OpenCLCPlusPlus && (!FD || IsFunctionTemplate)) { S.Diag(AL.getLoc(), diag::err_attribute_wrong_decl_type_str) << AL << AL.isRegularKeywordAttribute() << "functions"; + AL.setInvalid(); + return; } handleSimpleAttribute(S, D, AL); } + // TODO: isGPU() should probably return true for SPIR. + bool TargetDeviceEnvironment = Triple.isGPU() || Triple.isSPIR() || + LangOpts.isTargetDevice() || LangOpts.OpenCL; + if (!TargetDeviceEnvironment) { + S.Diag(AL.getLoc(), diag::warn_cconv_unsupported) + << AL << (int)Sema::CallingConventionIgnoredReason::ForThisTarget; + AL.setInvalid(); + return; + } + // Make sure we validate the CC with the target // and warn/error if necessary. handleCallConvAttr(S, D, AL); diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 5b3e89f936327..96d51426e0b5c 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -21,6 +21,7 @@ #include "clang/AST/Expr.h" #include "clang/AST/HLSLResource.h" #include "clang/AST/Type.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/Builtins.h" #include "clang/Basic/DiagnosticSema.h" @@ -770,23 +771,81 @@ void SemaHLSL::ActOnTopLevelFunction(FunctionDecl *FD) { } } -bool SemaHLSL::isSemanticValid(FunctionDecl *FD, DeclaratorDecl *D) { - const auto *AnnotationAttr = D->getAttr(); - if (AnnotationAttr) { - CheckSemanticAnnotation(FD, D, AnnotationAttr); - return true; +HLSLSemanticAttr *SemaHLSL::createSemantic(const SemanticInfo &Info, + DeclaratorDecl *TargetDecl) { + std::string SemanticName = Info.Semantic->getAttrName()->getName().upper(); + + if (SemanticName == "SV_DISPATCHTHREADID") { + return createSemanticAttr( + *Info.Semantic, TargetDecl, Info.Index); + } else if (SemanticName == "SV_GROUPINDEX") { + return createSemanticAttr(*Info.Semantic, TargetDecl, + Info.Index); + } else if (SemanticName == "SV_GROUPTHREADID") { + return 
createSemanticAttr(*Info.Semantic, + TargetDecl, Info.Index); + } else if (SemanticName == "SV_GROUPID") { + return createSemanticAttr(*Info.Semantic, TargetDecl, + Info.Index); + } else if (SemanticName == "SV_POSITION") { + return createSemanticAttr(*Info.Semantic, TargetDecl, + Info.Index); + } else + Diag(Info.Semantic->getLoc(), diag::err_hlsl_unknown_semantic) + << *Info.Semantic; + + return nullptr; +} + +bool SemaHLSL::determineActiveSemanticOnScalar(FunctionDecl *FD, + DeclaratorDecl *D, + SemanticInfo &ActiveSemantic) { + if (ActiveSemantic.Semantic == nullptr) { + ActiveSemantic.Semantic = D->getAttr(); + if (ActiveSemantic.Semantic && + ActiveSemantic.Semantic->isSemanticIndexExplicit()) + ActiveSemantic.Index = ActiveSemantic.Semantic->getSemanticIndex(); + } + + if (!ActiveSemantic.Semantic) { + Diag(D->getLocation(), diag::err_hlsl_missing_semantic_annotation); + return false; + } + + auto *A = createSemantic(ActiveSemantic, D); + if (!A) + return false; + + checkSemanticAnnotation(FD, D, A); + FD->addAttr(A); + return true; +} + +bool SemaHLSL::determineActiveSemantic(FunctionDecl *FD, DeclaratorDecl *D, + SemanticInfo &ActiveSemantic) { + if (ActiveSemantic.Semantic == nullptr) { + ActiveSemantic.Semantic = D->getAttr(); + if (ActiveSemantic.Semantic && + ActiveSemantic.Semantic->isSemanticIndexExplicit()) + ActiveSemantic.Index = ActiveSemantic.Semantic->getSemanticIndex(); } const Type *T = D->getType()->getUnqualifiedDesugaredType(); const RecordType *RT = dyn_cast(T); if (!RT) - return false; + return determineActiveSemanticOnScalar(FD, D, ActiveSemantic); const RecordDecl *RD = RT->getDecl(); for (FieldDecl *Field : RD->fields()) { - if (!isSemanticValid(FD, Field)) + SemanticInfo Info = ActiveSemantic; + if (!determineActiveSemantic(FD, Field, Info)) { + Diag(Field->getLocation(), diag::note_hlsl_semantic_used_here) << Field; return false; + } + if (ActiveSemantic.Semantic) + ActiveSemantic = Info; } + return true; } @@ -853,8 +912,11 @@ void 
SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { } for (ParmVarDecl *Param : FD->parameters()) { - if (!isSemanticValid(FD, Param)) { - Diag(FD->getLocation(), diag::err_hlsl_missing_semantic_annotation); + SemanticInfo ActiveSemantic; + ActiveSemantic.Semantic = nullptr; + ActiveSemantic.Index = std::nullopt; + + if (!determineActiveSemantic(FD, Param, ActiveSemantic)) { Diag(Param->getLocation(), diag::note_previous_decl) << Param; FD->setInvalidDecl(); } @@ -862,31 +924,31 @@ void SemaHLSL::CheckEntryPoint(FunctionDecl *FD) { // FIXME: Verify return type semantic annotation. } -void SemaHLSL::CheckSemanticAnnotation( - FunctionDecl *EntryPoint, const Decl *Param, - const HLSLAnnotationAttr *AnnotationAttr) { +void SemaHLSL::checkSemanticAnnotation(FunctionDecl *EntryPoint, + const Decl *Param, + const HLSLSemanticAttr *SemanticAttr) { auto *ShaderAttr = EntryPoint->getAttr(); assert(ShaderAttr && "Entry point has no shader attribute"); llvm::Triple::EnvironmentType ST = ShaderAttr->getType(); - switch (AnnotationAttr->getKind()) { + switch (SemanticAttr->getKind()) { case attr::HLSLSV_DispatchThreadID: case attr::HLSLSV_GroupIndex: case attr::HLSLSV_GroupThreadID: case attr::HLSLSV_GroupID: if (ST == llvm::Triple::Compute) return; - DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Compute}); + DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Compute}); break; case attr::HLSLSV_Position: // TODO(#143523): allow use on other shader types & output once the overall // semantic logic is implemented. 
if (ST == llvm::Triple::Pixel) return; - DiagnoseAttrStageMismatch(AnnotationAttr, ST, {llvm::Triple::Pixel}); + DiagnoseAttrStageMismatch(SemanticAttr, ST, {llvm::Triple::Pixel}); break; default: - llvm_unreachable("Unknown HLSLAnnotationAttr"); + llvm_unreachable("Unknown SemanticAttr"); } } @@ -1661,28 +1723,30 @@ void SemaHLSL::diagnoseSystemSemanticAttr(Decl *D, const ParsedAttr &AL, diagnoseInputIDType(ValueType, AL); if (IsOutput) Diag(AL.getLoc(), diag::err_hlsl_semantic_output_not_supported) << AL; - Attribute = createSemanticAttr(AL, Index); + Attribute = + createSemanticAttr(AL, nullptr, Index); } else if (SemanticName == "SV_GROUPINDEX") { if (IsOutput) Diag(AL.getLoc(), diag::err_hlsl_semantic_output_not_supported) << AL; - Attribute = createSemanticAttr(AL, Index); + Attribute = createSemanticAttr(AL, nullptr, Index); } else if (SemanticName == "SV_GROUPTHREADID") { diagnoseInputIDType(ValueType, AL); if (IsOutput) Diag(AL.getLoc(), diag::err_hlsl_semantic_output_not_supported) << AL; - Attribute = createSemanticAttr(AL, Index); + Attribute = + createSemanticAttr(AL, nullptr, Index); } else if (SemanticName == "SV_GROUPID") { diagnoseInputIDType(ValueType, AL); if (IsOutput) Diag(AL.getLoc(), diag::err_hlsl_semantic_output_not_supported) << AL; - Attribute = createSemanticAttr(AL, Index); + Attribute = createSemanticAttr(AL, nullptr, Index); } else if (SemanticName == "SV_POSITION") { const auto *VT = ValueType->getAs(); if (!ValueType->hasFloatingRepresentation() || (VT && VT->getNumElements() > 4)) Diag(AL.getLoc(), diag::err_hlsl_attr_invalid_type) << AL << "float/float1/float2/float3/float4"; - Attribute = createSemanticAttr(AL, Index); + Attribute = createSemanticAttr(AL, nullptr, Index); } else Diag(AL.getLoc(), diag::err_hlsl_unknown_semantic) << AL; @@ -3369,6 +3433,11 @@ static void BuildFlattenedTypeList(QualType BaseTy, List.insert(List.end(), VT->getNumElements(), VT->getElementType()); continue; } + if (const auto *MT = dyn_cast(T)) { + 
List.insert(List.end(), MT->getNumElementsFlattened(), + MT->getElementType()); + continue; + } if (const auto *RD = T->getAsCXXRecordDecl()) { if (RD->isStandardLayout()) RD = RD->getStandardLayoutBaseWithFields(); @@ -4167,6 +4236,32 @@ class InitListTransformer { } return true; } + if (auto *MTy = Ty->getAs()) { + unsigned Rows = MTy->getNumRows(); + unsigned Cols = MTy->getNumColumns(); + QualType ElemTy = MTy->getElementType(); + + for (unsigned C = 0; C < Cols; ++C) { + for (unsigned R = 0; R < Rows; ++R) { + // row index literal + Expr *RowIdx = IntegerLiteral::Create( + Ctx, llvm::APInt(Ctx.getIntWidth(Ctx.IntTy), R), Ctx.IntTy, + E->getBeginLoc()); + // column index literal + Expr *ColIdx = IntegerLiteral::Create( + Ctx, llvm::APInt(Ctx.getIntWidth(Ctx.IntTy), C), Ctx.IntTy, + E->getBeginLoc()); + ExprResult ElExpr = S.CreateBuiltinMatrixSubscriptExpr( + E, RowIdx, ColIdx, E->getEndLoc()); + if (ElExpr.isInvalid()) + return false; + if (!castInitializer(ElExpr.get())) + return false; + ElExpr.get()->setType(ElemTy); + } + } + return true; + } if (auto *ArrTy = dyn_cast(Ty.getTypePtr())) { uint64_t Size = ArrTy->getZExtSize(); @@ -4220,14 +4315,17 @@ class InitListTransformer { return *(ArgIt++); llvm::SmallVector Inits; - assert(!isa(Ty) && "Matrix types not yet supported in HLSL"); Ty = Ty.getDesugaredType(Ctx); - if (Ty->isVectorType() || Ty->isConstantArrayType()) { + if (Ty->isVectorType() || Ty->isConstantArrayType() || + Ty->isConstantMatrixType()) { QualType ElTy; uint64_t Size = 0; if (auto *ATy = Ty->getAs()) { ElTy = ATy->getElementType(); Size = ATy->getNumElements(); + } else if (auto *CMTy = Ty->getAs()) { + ElTy = CMTy->getElementType(); + Size = CMTy->getNumElementsFlattened(); } else { auto *VTy = cast(Ty.getTypePtr()); ElTy = VTy->getElementType(); diff --git a/clang/lib/Sema/SemaInit.cpp b/clang/lib/Sema/SemaInit.cpp index 42cb3c07b7247..410a2e49100b8 100644 --- a/clang/lib/Sema/SemaInit.cpp +++ b/clang/lib/Sema/SemaInit.cpp @@ -17,6 
+17,7 @@ #include "clang/AST/ExprCXX.h" #include "clang/AST/ExprObjC.h" #include "clang/AST/IgnoreExpr.h" +#include "clang/AST/TypeBase.h" #include "clang/AST/TypeLoc.h" #include "clang/Basic/SourceManager.h" #include "clang/Basic/Specifiers.h" @@ -403,6 +404,9 @@ class InitListChecker { unsigned &Index, InitListExpr *StructuredList, unsigned &StructuredIndex); + void CheckMatrixType(const InitializedEntity &Entity, InitListExpr *IList, + QualType DeclType, unsigned &Index, + InitListExpr *StructuredList, unsigned &StructuredIndex); void CheckVectorType(const InitializedEntity &Entity, InitListExpr *IList, QualType DeclType, unsigned &Index, InitListExpr *StructuredList, @@ -1004,7 +1008,8 @@ InitListChecker::FillInEmptyInitializations(const InitializedEntity &Entity, return; if (ElementEntity.getKind() == InitializedEntity::EK_ArrayElement || - ElementEntity.getKind() == InitializedEntity::EK_VectorElement) + ElementEntity.getKind() == InitializedEntity::EK_VectorElement || + ElementEntity.getKind() == InitializedEntity::EK_MatrixElement) ElementEntity.setElementIndex(Init); if (Init >= NumInits && (ILE->hasArrayFiller() || SkipEmptyInitChecks)) @@ -1274,6 +1279,7 @@ static void warnBracedScalarInit(Sema &S, const InitializedEntity &Entity, switch (Entity.getKind()) { case InitializedEntity::EK_VectorElement: + case InitializedEntity::EK_MatrixElement: case InitializedEntity::EK_ComplexElement: case InitializedEntity::EK_ArrayElement: case InitializedEntity::EK_Parameter: @@ -1373,11 +1379,12 @@ void InitListChecker::CheckExplicitInitList(const InitializedEntity &Entity, SemaRef.Diag(IList->getInit(Index)->getBeginLoc(), DK) << T << IList->getInit(Index)->getSourceRange(); } else { - int initKind = T->isArrayType() ? 0 : - T->isVectorType() ? 1 : - T->isScalarType() ? 2 : - T->isUnionType() ? 3 : - 4; + int initKind = T->isArrayType() ? 0 + : T->isVectorType() ? 1 + : T->isMatrixType() ? 2 + : T->isScalarType() ? 3 + : T->isUnionType() ? 
4 + : 5; unsigned DK = ExtraInitsIsError ? diag::err_excess_initializers : diag::ext_excess_initializers; @@ -1431,6 +1438,9 @@ void InitListChecker::CheckListElementTypes(const InitializedEntity &Entity, } else if (DeclType->isVectorType()) { CheckVectorType(Entity, IList, DeclType, Index, StructuredList, StructuredIndex); + } else if (DeclType->isMatrixType()) { + CheckMatrixType(Entity, IList, DeclType, Index, StructuredList, + StructuredIndex); } else if (const RecordDecl *RD = DeclType->getAsRecordDecl()) { auto Bases = CXXRecordDecl::base_class_const_range(CXXRecordDecl::base_class_const_iterator(), @@ -1878,6 +1888,37 @@ void InitListChecker::CheckReferenceType(const InitializedEntity &Entity, AggrDeductionCandidateParamTypes->push_back(DeclType); } +void InitListChecker::CheckMatrixType(const InitializedEntity &Entity, + InitListExpr *IList, QualType DeclType, + unsigned &Index, + InitListExpr *StructuredList, + unsigned &StructuredIndex) { + if (!SemaRef.getLangOpts().HLSL) + return; + + const ConstantMatrixType *MT = DeclType->castAs<ConstantMatrixType>(); + QualType ElemTy = MT->getElementType(); + const unsigned MaxElts = MT->getNumElementsFlattened(); + + unsigned NumEltsInit = 0; + InitializedEntity ElemEnt = + InitializedEntity::InitializeElement(SemaRef.Context, 0, Entity); + + while (NumEltsInit < MaxElts && Index < IList->getNumInits()) { + // Not a sublist: just consume directly. + ElemEnt.setElementIndex(Index); + CheckSubElementType(ElemEnt, IList, ElemTy, Index, StructuredList, + StructuredIndex); + ++NumEltsInit; + } + + // For HLSL, the error for this case is handled by SemaHLSL's initializer + // list diagnostics. That means NumEltsInit must equal MaxElts here; in + // other words, execution should never reach this point if that condition + // does not hold.
+ assert(NumEltsInit == MaxElts && "NumEltsInit must equal MaxElts"); +} + void InitListChecker::CheckVectorType(const InitializedEntity &Entity, InitListExpr *IList, QualType DeclType, unsigned &Index, @@ -3640,6 +3681,9 @@ InitializedEntity::InitializedEntity(ASTContext &Context, unsigned Index, } else if (const VectorType *VT = Parent.getType()->getAs()) { Kind = EK_VectorElement; Type = VT->getElementType(); + } else if (const MatrixType *MT = Parent.getType()->getAs()) { + Kind = EK_MatrixElement; + Type = MT->getElementType(); } else { const ComplexType *CT = Parent.getType()->getAs(); assert(CT && "Unexpected type"); @@ -3688,6 +3732,7 @@ DeclarationName InitializedEntity::getName() const { case EK_Delegating: case EK_ArrayElement: case EK_VectorElement: + case EK_MatrixElement: case EK_ComplexElement: case EK_BlockElement: case EK_LambdaToBlockConversionBlockElement: @@ -3721,6 +3766,7 @@ ValueDecl *InitializedEntity::getDecl() const { case EK_Delegating: case EK_ArrayElement: case EK_VectorElement: + case EK_MatrixElement: case EK_ComplexElement: case EK_BlockElement: case EK_LambdaToBlockConversionBlockElement: @@ -3754,6 +3800,7 @@ bool InitializedEntity::allowsNRVO() const { case EK_Delegating: case EK_ArrayElement: case EK_VectorElement: + case EK_MatrixElement: case EK_ComplexElement: case EK_BlockElement: case EK_LambdaToBlockConversionBlockElement: @@ -3793,6 +3840,9 @@ unsigned InitializedEntity::dumpImpl(raw_ostream &OS) const { case EK_Delegating: OS << "Delegating"; break; case EK_ArrayElement: OS << "ArrayElement " << Index; break; case EK_VectorElement: OS << "VectorElement " << Index; break; + case EK_MatrixElement: + OS << "MatrixElement " << Index; + break; case EK_ComplexElement: OS << "ComplexElement " << Index; break; case EK_BlockElement: OS << "Block"; break; case EK_LambdaToBlockConversionBlockElement: @@ -6030,7 +6080,7 @@ static void TryOrBuildParenListInitialization( 
Sequence.SetFailed(InitializationSequence::FK_ParenthesizedListInitFailed); if (!VerifyOnly) { QualType T = Entity.getType(); - int InitKind = T->isArrayType() ? 0 : T->isUnionType() ? 3 : 4; + int InitKind = T->isArrayType() ? 0 : T->isUnionType() ? 4 : 5; SourceRange ExcessInitSR(Args[EntityIndexToProcess]->getBeginLoc(), Args.back()->getEndLoc()); S.Diag(Kind.getLocation(), diag::err_excess_initializers) @@ -6823,7 +6873,8 @@ void InitializationSequence::InitializeFrom(Sema &S, // For HLSL ext vector types we allow list initialization behavior for C++ // functional cast expressions which look like constructor syntax. This is // accomplished by converting initialization arguments to InitListExpr. - if (S.getLangOpts().HLSL && Args.size() > 1 && DestType->isExtVectorType() && + if (S.getLangOpts().HLSL && Args.size() > 1 && + (DestType->isExtVectorType() || DestType->isConstantMatrixType()) && (SourceType.isNull() || !Context.hasSameUnqualifiedType(SourceType, DestType))) { InitListExpr *ILE = new (Context) @@ -6988,6 +7039,7 @@ static AssignmentAction getAssignmentAction(const InitializedEntity &Entity, case InitializedEntity::EK_Binding: case InitializedEntity::EK_ArrayElement: case InitializedEntity::EK_VectorElement: + case InitializedEntity::EK_MatrixElement: case InitializedEntity::EK_ComplexElement: case InitializedEntity::EK_BlockElement: case InitializedEntity::EK_LambdaToBlockConversionBlockElement: @@ -7013,6 +7065,7 @@ static bool shouldBindAsTemporary(const InitializedEntity &Entity) { case InitializedEntity::EK_Base: case InitializedEntity::EK_Delegating: case InitializedEntity::EK_VectorElement: + case InitializedEntity::EK_MatrixElement: case InitializedEntity::EK_ComplexElement: case InitializedEntity::EK_Exception: case InitializedEntity::EK_BlockElement: @@ -7043,6 +7096,7 @@ static bool shouldDestroyEntity(const InitializedEntity &Entity) { case InitializedEntity::EK_Base: case InitializedEntity::EK_Delegating: case 
InitializedEntity::EK_VectorElement: + case InitializedEntity::EK_MatrixElement: case InitializedEntity::EK_ComplexElement: case InitializedEntity::EK_BlockElement: case InitializedEntity::EK_LambdaToBlockConversionBlockElement: @@ -7096,6 +7150,7 @@ static SourceLocation getInitializationLoc(const InitializedEntity &Entity, case InitializedEntity::EK_Base: case InitializedEntity::EK_Delegating: case InitializedEntity::EK_VectorElement: + case InitializedEntity::EK_MatrixElement: case InitializedEntity::EK_ComplexElement: case InitializedEntity::EK_BlockElement: case InitializedEntity::EK_LambdaToBlockConversionBlockElement: @@ -7845,11 +7900,13 @@ ExprResult InitializationSequence::Perform(Sema &S, ExprResult CurInit((Expr *)nullptr); SmallVector ArrayLoopCommonExprs; - // HLSL allows vector initialization to function like list initialization, but - // use the syntax of a C++-like constructor. - bool IsHLSLVectorInit = S.getLangOpts().HLSL && DestType->isExtVectorType() && - isa(Args[0]); - (void)IsHLSLVectorInit; + // HLSL allows vector/matrix initialization to function like list + // initialization, but use the syntax of a C++-like constructor. 
+ bool IsHLSLVectorOrMatrixInit = + S.getLangOpts().HLSL && + (DestType->isExtVectorType() || DestType->isConstantMatrixType()) && + isa(Args[0]); + (void)IsHLSLVectorOrMatrixInit; // For initialization steps that start with a single initializer, // grab the only argument out the Args and place it into the "current" @@ -7888,7 +7945,7 @@ ExprResult InitializationSequence::Perform(Sema &S, case SK_StdInitializerList: case SK_OCLSamplerInit: case SK_OCLZeroOpaqueType: { - assert(Args.size() == 1 || IsHLSLVectorInit); + assert(Args.size() == 1 || IsHLSLVectorOrMatrixInit); CurInit = Args[0]; if (!CurInit.get()) return ExprError(); break; @@ -9114,7 +9171,7 @@ bool InitializationSequence::Diagnose(Sema &S, << R; else S.Diag(Kind.getLocation(), diag::err_excess_initializers) - << /*scalar=*/2 << R; + << /*scalar=*/3 << R; break; } diff --git a/clang/lib/Sema/SemaObjCProperty.cpp b/clang/lib/Sema/SemaObjCProperty.cpp index 1880cec6ec8e5..67c554c50a8ce 100644 --- a/clang/lib/Sema/SemaObjCProperty.cpp +++ b/clang/lib/Sema/SemaObjCProperty.cpp @@ -1041,7 +1041,7 @@ RedeclarePropertyAccessor(ASTContext &Context, ObjCImplementationDecl *Impl, Decl->getSelector(), Decl->getReturnType(), Decl->getReturnTypeSourceInfo(), Impl, Decl->isInstanceMethod(), Decl->isVariadic(), Decl->isPropertyAccessor(), - /* isSynthesized*/ true, Decl->isImplicit(), Decl->isDefined(), + /*isSynthesizedAccessorStub=*/true, Decl->isImplicit(), Decl->isDefined(), Decl->getImplementationControl(), Decl->hasRelatedResultType()); ImplDecl->getMethodFamily(); if (Decl->hasAttrs()) diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 3bb8080f6e72c..f0f3832e160cd 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -2999,11 +2999,11 @@ bool SemaOpenACC::CreateReductionCombinerRecipe( BinOp = BinaryOperatorKind::BO_LT; break; case OpenACCReductionOperator::And: + BinOp = BinaryOperatorKind::BO_LAnd; + break; case OpenACCReductionOperator::Or: - // 
We just want a 'NYI' error in the backend, so leave an empty combiner - // recipe, and claim success. - CombinerRecipes.push_back({nullptr, nullptr, nullptr}); - return false; + BinOp = BinaryOperatorKind::BO_LOr; + break; } // If VarTy is an array type, at the top level only, we want to do our @@ -3068,11 +3068,25 @@ bool SemaOpenACC::CreateReductionCombinerRecipe( : CombinerFailureKind::Assignment}; } case OpenACCReductionOperator::And: - case OpenACCReductionOperator::Or: - llvm_unreachable("And/Or not implemented, but should fail earlier"); + case OpenACCReductionOperator::Or: { + // These are done as LHS = LHS && RHS (or LHS = LHS || RHS). So after the + // binop, all we have to do is the assignment. + if (!BinOpRes.isUsable()) + return {BinOpRes, CombinerFailureKind::BinOp}; + + // Build assignment. + ExprResult Assignment = SemaRef.BuildBinOp(SemaRef.getCurScope(), Loc, + BinaryOperatorKind::BO_Assign, + LHSDRE, BinOpRes.get(), + /*ForFoldExpr=*/false); + return {Assignment, Assignment.isUsable() + ? CombinerFailureKind::None + : CombinerFailureKind::Assignment}; + } case OpenACCReductionOperator::Invalid: llvm_unreachable("Invalid should have been caught above"); } + llvm_unreachable("Unhandled case"); }; auto tryCombiner = [&, this](DeclRefExpr *LHSDRE, DeclRefExpr *RHSDRE, diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index 8b5f4b71f3134..6f2ab74d8f0eb 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -7689,7 +7689,9 @@ void SemaOpenMP::ActOnStartOfFunctionDefinitionInOpenMPDeclareVariantScope( FunctionDecl *UDecl = nullptr; if (IsTemplated && isa(CandidateDecl)) { auto *FTD = cast(CandidateDecl); - if (FTD->getTemplateParameters()->size() == TemplateParamLists.size()) + // FIXME: Should this compare the template parameter lists on all levels? 
+ if (SemaRef.Context.isSameTemplateParameterList( + FTD->getTemplateParameters(), TemplateParamLists.back())) UDecl = FTD->getTemplatedDecl(); } else if (!IsTemplated) UDecl = dyn_cast(CandidateDecl); diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 06e5dab35cc3e..6d011239ec813 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -2533,15 +2533,12 @@ static bool IsStandardConversion(Sema &S, Expr* From, QualType ToType, SCS.setToType(2, FromType); - // If we have not converted the argument type to the parameter type, - // this is a bad conversion sequence, unless we're resolving an overload in C. - // - // Permit conversions from a function without `cfi_unchecked_callee` to a - // function with `cfi_unchecked_callee`. - if (CanonFrom == CanonTo || S.AddingCFIUncheckedCallee(CanonFrom, CanonTo)) + if (CanonFrom == CanonTo) return true; - if ((S.getLangOpts().CPlusPlus || !InOverloadResolution)) + // If we have not converted the argument type to the parameter type, + // this is a bad conversion sequence, unless we're resolving an overload in C. 
+ if (S.getLangOpts().CPlusPlus || !InOverloadResolution) return false; ExprResult ER = ExprResult{From}; diff --git a/clang/lib/Sema/SemaRISCV.cpp b/clang/lib/Sema/SemaRISCV.cpp index b5f91a318ced9..75dba8049a43c 100644 --- a/clang/lib/Sema/SemaRISCV.cpp +++ b/clang/lib/Sema/SemaRISCV.cpp @@ -664,6 +664,80 @@ bool SemaRISCV::CheckBuiltinFunctionCall(const TargetInfo &TI, return CheckVSetVL(1, 2); case RISCVVector::BI__builtin_rvv_vsetvlimax: return CheckVSetVL(0, 1); + case RISCVVector::BI__builtin_rvv_sf_vsettnt: + case RISCVVector::BI__builtin_rvv_sf_vsettm: + case RISCVVector::BI__builtin_rvv_sf_vsettn: + case RISCVVector::BI__builtin_rvv_sf_vsettk: + return SemaRef.BuiltinConstantArgRange(TheCall, 1, 0, 3) || + SemaRef.BuiltinConstantArgRange(TheCall, 2, 1, 3); + case RISCVVector::BI__builtin_rvv_sf_mm_f_f_w1: + case RISCVVector::BI__builtin_rvv_sf_mm_f_f_w2: + case RISCVVector::BI__builtin_rvv_sf_mm_e5m2_e4m3_w4: + case RISCVVector::BI__builtin_rvv_sf_mm_e5m2_e5m2_w4: + case RISCVVector::BI__builtin_rvv_sf_mm_e4m3_e4m3_w4: + case RISCVVector::BI__builtin_rvv_sf_mm_e4m3_e5m2_w4: + case RISCVVector::BI__builtin_rvv_sf_mm_u_u_w4: + case RISCVVector::BI__builtin_rvv_sf_mm_u_s_w4: + case RISCVVector::BI__builtin_rvv_sf_mm_s_u_w4: + case RISCVVector::BI__builtin_rvv_sf_mm_s_s_w4: { + QualType Arg1Type = TheCall->getArg(1)->getType(); + ASTContext::BuiltinVectorTypeInfo Info = + SemaRef.Context.getBuiltinVectorTypeInfo( + Arg1Type->castAs()); + unsigned EltSize = SemaRef.Context.getTypeSize(Info.ElementType); + llvm::APSInt Result; + + // We can't check the value of a dependent argument. + Expr *Arg = TheCall->getArg(0); + if (Arg->isTypeDependent() || Arg->isValueDependent()) + return false; + + // Check constant-ness first. + if (SemaRef.BuiltinConstantArg(TheCall, 0, Result)) + return true; + + // For TEW = 32, mtd can only be 0, 4, 8, 12. + // For TEW = 64, mtd can only be 0, 2, 4, 6, 8, 10, 12, 14. 
+ // Only `sf_mm_f_f_w1` and `sf_mm_f_f_w2` might have TEW = 64. + if ((BuiltinID == RISCVVector::BI__builtin_rvv_sf_mm_f_f_w1 && + EltSize == 64) || + (BuiltinID == RISCVVector::BI__builtin_rvv_sf_mm_f_f_w2 && + EltSize == 32)) + return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 15) || + SemaRef.BuiltinConstantArgMultiple(TheCall, 0, 2); + return SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 15) || + SemaRef.BuiltinConstantArgMultiple(TheCall, 0, 4); + } + case RISCVVector::BI__builtin_rvv_sf_vtzero_t: { + llvm::APSInt Log2SEWResult; + llvm::APSInt TWidenResult; + if (SemaRef.BuiltinConstantArg(TheCall, 3, Log2SEWResult) || + SemaRef.BuiltinConstantArg(TheCall, 4, TWidenResult)) + return true; + + int Log2SEW = Log2SEWResult.getSExtValue(); + int TWiden = TWidenResult.getSExtValue(); + + // Valid range: 3 <= Log2SEW <= 6. + if (SemaRef.BuiltinConstantArgRange(TheCall, 3, 3, 6)) + return true; + + // TWiden must be 1, 2, or 4. + if (TWiden != 1 && TWiden != 2 && TWiden != 4) + return Diag(TheCall->getBeginLoc(), + diag::err_riscv_builtin_invalid_twiden); + + int TEW = (1 << Log2SEW) * TWiden; + + // For TEW = 8, mtd can be 0~15. + // For TEW = 16 or 64, mtd can only be 0, 2, 4, 6, 8, 10, 12, 14. + // For TEW = 32, mtd can only be 0, 4, 8, 12.
+ if (SemaRef.BuiltinConstantArgRange(TheCall, 0, 0, 15)) + return true; + if (TEW == 16 || TEW == 64) + return SemaRef.BuiltinConstantArgMultiple(TheCall, 0, 2); + return SemaRef.BuiltinConstantArgMultiple(TheCall, 0, 4); + } case RISCVVector::BI__builtin_rvv_vget_v: { ASTContext::BuiltinVectorTypeInfo ResVecInfo = Context.getBuiltinVectorTypeInfo(cast( diff --git a/clang/lib/Sema/SemaType.cpp b/clang/lib/Sema/SemaType.cpp index 7c1fb12a549e6..280b3c92cce14 100644 --- a/clang/lib/Sema/SemaType.cpp +++ b/clang/lib/Sema/SemaType.cpp @@ -134,7 +134,6 @@ static void diagnoseBadTypeAttribute(Sema &S, const ParsedAttr &attr, case ParsedAttr::AT_VectorCall: \ case ParsedAttr::AT_AArch64VectorPcs: \ case ParsedAttr::AT_AArch64SVEPcs: \ - case ParsedAttr::AT_DeviceKernel: \ case ParsedAttr::AT_MSABI: \ case ParsedAttr::AT_SysVABI: \ case ParsedAttr::AT_Pcs: \ @@ -3786,7 +3785,8 @@ static CallingConv getCCForDeclaratorChunk( } } } - for (const ParsedAttr &AL : D.getDeclSpec().getAttributes()) { + for (const ParsedAttr &AL : llvm::concat( + D.getDeclSpec().getAttributes(), D.getAttributes())) { if (AL.getKind() == ParsedAttr::AT_DeviceKernel) { CC = CC_DeviceKernel; break; @@ -7569,8 +7569,6 @@ static Attr *getCCTypeAttr(ASTContext &Ctx, ParsedAttr &Attr) { return createSimpleAttr(Ctx, Attr); case ParsedAttr::AT_ArmStreaming: return createSimpleAttr(Ctx, Attr); - case ParsedAttr::AT_DeviceKernel: - return createSimpleAttr(Ctx, Attr); case ParsedAttr::AT_Pcs: { // The attribute may have had a fixit applied where we treated an // identifier as a string literal. The contents of the string are valid, @@ -8809,16 +8807,6 @@ static void HandleHLSLParamModifierAttr(TypeProcessingState &State, } } -static bool isMultiSubjectAttrAllowedOnType(const ParsedAttr &Attr) { - // The DeviceKernel attribute is shared for many targets, and - // it is only allowed to be a type attribute with the AMDGPU - // spelling, so skip processing the attr as a type attr - // unless it has that spelling. 
- if (Attr.getKind() != ParsedAttr::AT_DeviceKernel) - return true; - return DeviceKernelAttr::isAMDGPUSpelling(Attr); -} - static void processTypeAttrs(TypeProcessingState &state, QualType &type, TypeAttrLocation TAL, const ParsedAttributesView &attrs, @@ -9072,8 +9060,6 @@ static void processTypeAttrs(TypeProcessingState &state, QualType &type, break; [[fallthrough]]; FUNCTION_TYPE_ATTRS_CASELIST: - if (!isMultiSubjectAttrAllowedOnType(attr)) - break; attr.setUsedAsTypeAttr(); diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp index 8b3fd41adb465..c1b5cb730e4a4 100644 --- a/clang/lib/Serialization/ASTReader.cpp +++ b/clang/lib/Serialization/ASTReader.cpp @@ -5811,7 +5811,13 @@ bool ASTReader::readASTFileControlBlock( // FIXME: This allows use of the VFS; we do not allow use of the // VFS when actually loading a module. - auto BufferOrErr = FileMgr.getBufferForFile(Filename); + auto Entry = + Filename == "-" ? FileMgr.getSTDIN() : FileMgr.getFileRef(Filename); + if (!Entry) { + llvm::consumeError(Entry.takeError()); + return true; + } + auto BufferOrErr = FileMgr.getBufferForFile(*Entry); if (!BufferOrErr) return true; OwnedBuffer = std::move(*BufferOrErr); diff --git a/clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp index 40985f426bdc2..6439e6cedac08 100644 --- a/clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/MacOSXAPIChecker.cpp @@ -151,12 +151,10 @@ void MacOSXAPIChecker::checkPreStmt(const CallExpr *CE, return; SubChecker SC = - llvm::StringSwitch(Name) - .Cases("dispatch_once", - "_dispatch_once", - "dispatch_once_f", - &MacOSXAPIChecker::CheckDispatchOnce) - .Default(nullptr); + llvm::StringSwitch(Name) + .Cases({"dispatch_once", "_dispatch_once", "dispatch_once_f"}, + &MacOSXAPIChecker::CheckDispatchOnce) + .Default(nullptr); if (SC) (this->*SC)(C, CE, Name); diff --git 
a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp index 033eb8cc299b0..f60d1936b7584 100644 --- a/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp +++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/RawPtrRefLambdaCapturesChecker.cpp @@ -50,7 +50,9 @@ class RawPtrRefLambdaCapturesChecker llvm::DenseSet DeclRefExprsToIgnore; llvm::DenseSet LambdasToIgnore; llvm::DenseSet ProtectedThisDecls; + llvm::DenseSet CallToIgnore; llvm::DenseSet ConstructToIgnore; + llvm::DenseMap LambdaOwnerMap; QualType ClsType; @@ -101,10 +103,60 @@ class RawPtrRefLambdaCapturesChecker auto *Init = VD->getInit(); if (!Init) return true; - auto *L = dyn_cast_or_null(Init->IgnoreParenCasts()); - if (!L) + if (auto *L = dyn_cast_or_null(Init->IgnoreParenCasts())) { + LambdasToIgnore.insert(L); // Evaluate lambdas in VisitDeclRefExpr. + return true; + } + if (!VD->hasLocalStorage()) return true; - LambdasToIgnore.insert(L); // Evaluate lambdas in VisitDeclRefExpr. 
+ if (auto *E = dyn_cast(Init)) + Init = E->getSubExpr(); + if (auto *E = dyn_cast(Init)) + Init = E->getSubExpr(); + if (auto *CE = dyn_cast(Init)) { + if (auto *Callee = CE->getDirectCallee()) { + auto FnName = safeGetName(Callee); + unsigned ArgCnt = CE->getNumArgs(); + if (FnName == "makeScopeExit" && ArgCnt == 1) { + auto *Arg = CE->getArg(0); + if (auto *E = dyn_cast(Arg)) + Arg = E->getSubExpr(); + if (auto *L = dyn_cast(Arg)) { + LambdaOwnerMap.insert(std::make_pair(VD, L)); + CallToIgnore.insert(CE); + LambdasToIgnore.insert(L); + } + } else if (FnName == "makeVisitor") { + for (unsigned ArgIndex = 0; ArgIndex < ArgCnt; ++ArgIndex) { + auto *Arg = CE->getArg(ArgIndex); + if (auto *E = dyn_cast(Arg)) + Arg = E->getSubExpr(); + if (auto *L = dyn_cast(Arg)) { + LambdaOwnerMap.insert(std::make_pair(VD, L)); + CallToIgnore.insert(CE); + LambdasToIgnore.insert(L); + } + } + } + } + } else if (auto *CE = dyn_cast(Init)) { + if (auto *Ctor = CE->getConstructor()) { + if (auto *Cls = Ctor->getParent()) { + auto FnName = safeGetName(Cls); + unsigned ArgCnt = CE->getNumArgs(); + if (FnName == "ScopeExit" && ArgCnt == 1) { + auto *Arg = CE->getArg(0); + if (auto *E = dyn_cast(Arg)) + Arg = E->getSubExpr(); + if (auto *L = dyn_cast(Arg)) { + LambdaOwnerMap.insert(std::make_pair(VD, L)); + ConstructToIgnore.insert(CE); + LambdasToIgnore.insert(L); + } + } + } + } + } return true; } @@ -114,6 +166,12 @@ class RawPtrRefLambdaCapturesChecker auto *VD = dyn_cast_or_null(DRE->getDecl()); if (!VD) return true; + if (auto It = LambdaOwnerMap.find(VD); It != LambdaOwnerMap.end()) { + auto *L = It->second; + Checker->visitLambdaExpr(L, shouldCheckThis() && !hasProtectedThis(L), + ClsType); + return true; + } auto *Init = VD->getInit(); if (!Init) return true; @@ -167,10 +225,14 @@ class RawPtrRefLambdaCapturesChecker } bool VisitCallExpr(CallExpr *CE) override { + if (CallToIgnore.contains(CE)) + return true; checkCalleeLambda(CE); - if (auto *Callee = CE->getDirectCallee()) + 
if (auto *Callee = CE->getDirectCallee()) { + if (isVisitFunction(CE, Callee)) + return true; checkParameters(CE, Callee); - else if (auto *CalleeE = CE->getCallee()) { + } else if (auto *CalleeE = CE->getCallee()) { if (auto *DRE = dyn_cast(CalleeE->IgnoreParenCasts())) { if (auto *Callee = dyn_cast_or_null(DRE->getDecl())) checkParameters(CE, Callee); @@ -179,6 +241,34 @@ class RawPtrRefLambdaCapturesChecker return true; } + bool isVisitFunction(CallExpr *CallExpr, FunctionDecl *FnDecl) { + bool IsVisitFn = safeGetName(FnDecl) == "visit"; + if (!IsVisitFn) + return false; + bool ArgCnt = CallExpr->getNumArgs(); + if (!ArgCnt) + return false; + auto *Ns = FnDecl->getParent(); + if (!Ns) + return false; + auto NsName = safeGetName(Ns); + if (NsName != "WTF" && NsName != "std") + return false; + auto *Arg = CallExpr->getArg(0); + if (!Arg) + return false; + auto *DRE = dyn_cast(Arg->IgnoreParenCasts()); + if (!DRE) + return false; + auto *VD = dyn_cast(DRE->getDecl()); + if (!VD) + return false; + if (!LambdaOwnerMap.contains(VD)) + return false; + DeclRefExprsToIgnore.insert(DRE); + return true; + } + void checkParameters(CallExpr *CE, FunctionDecl *Callee) { unsigned ArgIndex = isa(CE); bool TreatAllArgsAsNoEscape = shouldTreatAllArgAsNoEscape(Callee); @@ -280,7 +370,7 @@ class RawPtrRefLambdaCapturesChecker LambdasToIgnore.insert(L); } - bool hasProtectedThis(LambdaExpr *L) { + bool hasProtectedThis(const LambdaExpr *L) { for (const LambdaCapture &OtherCapture : L->captures()) { if (!OtherCapture.capturesVariable()) continue; @@ -378,7 +468,8 @@ class RawPtrRefLambdaCapturesChecker visitor.TraverseDecl(const_cast(TUD)); } - void visitLambdaExpr(LambdaExpr *L, bool shouldCheckThis, const QualType T, + void visitLambdaExpr(const LambdaExpr *L, bool shouldCheckThis, + const QualType T, bool ignoreParamVarDecl = false) const { if (TFA.isTrivial(L->getBody())) return; @@ -410,7 +501,7 @@ class RawPtrRefLambdaCapturesChecker } void reportBug(const LambdaCapture 
&Capture, ValueDecl *CapturedVar, - const QualType T, LambdaExpr *L) const { + const QualType T, const LambdaExpr *L) const { assert(CapturedVar); auto Location = Capture.getLocation(); diff --git a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp index 82b560b2613f8..e0deec16afb12 100644 --- a/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/AnalysisConsumer.cpp @@ -128,7 +128,6 @@ class AnalysisConsumer : public AnalysisASTConsumer, std::unique_ptr SyntaxCheckTimer; std::unique_ptr ExprEngineTimer; std::unique_ptr BugReporterTimer; - bool ShouldClearTimersToPreventDisplayingThem; /// The information about analyzed functions shared throughout the /// translation unit. @@ -149,7 +148,10 @@ class AnalysisConsumer : public AnalysisASTConsumer, if (Opts.AnalyzerDisplayProgress || Opts.PrintStats || Opts.ShouldSerializeStats || !Opts.DumpEntryPointStatsToCSV.empty()) { AnalyzerTimers = std::make_unique( - "analyzer", "Analyzer timers"); + "analyzer", "Analyzer timers", + /*PrintOnExit=*/ + (Opts.AnalyzerDisplayProgress || Opts.PrintStats || + Opts.ShouldSerializeStats)); SyntaxCheckTimer = std::make_unique( "syntaxchecks", "Syntax-based analysis time", *AnalyzerTimers); ExprEngineTimer = std::make_unique( @@ -159,12 +161,6 @@ class AnalysisConsumer : public AnalysisASTConsumer, *AnalyzerTimers); } - // Avoid displaying the timers created above in case we only want to record - // per-entry-point stats. 
- ShouldClearTimersToPreventDisplayingThem = !Opts.AnalyzerDisplayProgress && - !Opts.PrintStats && - !Opts.ShouldSerializeStats; - if (Opts.PrintStats || Opts.ShouldSerializeStats) { llvm::EnableStatistics(/* DoPrintOnExit= */ false); } @@ -287,9 +283,6 @@ class AnalysisConsumer : public AnalysisASTConsumer, checkerMgr->runCheckersOnASTDecl(D, *Mgr, *RecVisitorBR); if (SyntaxCheckTimer) SyntaxCheckTimer->stopTimer(); - if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) { - AnalyzerTimers->clear(); - } } return true; } @@ -583,9 +576,6 @@ void AnalysisConsumer::runAnalysisOnTranslationUnit(ASTContext &C) { checkerMgr->runCheckersOnASTDecl(TU, *Mgr, BR); if (SyntaxCheckTimer) SyntaxCheckTimer->stopTimer(); - if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) { - AnalyzerTimers->clear(); - } // Run the AST-only checks using the order in which functions are defined. // If inlining is not turned on, use the simplest function order for path @@ -765,9 +755,6 @@ void AnalysisConsumer::HandleCode(Decl *D, AnalysisMode Mode, FunctionSummaries.findOrInsertSummary(D)->second.SyntaxRunningTime = std::lround(CheckerDuration.getWallTime() * 1000); DisplayTime(CheckerDuration); - if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) { - AnalyzerTimers->clear(); - } } } @@ -830,9 +817,6 @@ void AnalysisConsumer::RunPathSensitiveChecks(Decl *D, PathRunningTime.set(static_cast( std::lround(ExprEngineDuration.getWallTime() * 1000))); DisplayTime(ExprEngineDuration); - if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) { - AnalyzerTimers->clear(); - } } if (!Mgr->options.DumpExplodedGraphTo.empty()) @@ -843,9 +827,6 @@ void AnalysisConsumer::RunPathSensitiveChecks(Decl *D, Eng.ViewGraph(Mgr->options.TrimGraph); flushReports(BugReporterTimer.get(), Eng.getBugReporter()); - if (AnalyzerTimers && ShouldClearTimersToPreventDisplayingThem) { - AnalyzerTimers->clear(); - } } 
//===----------------------------------------------------------------------===// diff --git a/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp b/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp index 5301f88057203..531c642bf4f31 100644 --- a/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp +++ b/clang/lib/StaticAnalyzer/Frontend/ModelInjector.cpp @@ -93,6 +93,7 @@ void ModelInjector::onBodySynthesis(const NamedDecl *D) { // The instance wants to take ownership, however DisableFree frontend option // is set to true to avoid double free issues + Instance.setVirtualFileSystem(CI.getVirtualFileSystemPtr()); Instance.setFileManager(CI.getFileManagerPtr()); Instance.setSourceManager(SM); Instance.setPreprocessor(CI.getPreprocessorPtr()); diff --git a/clang/lib/Support/RISCVVIntrinsicUtils.cpp b/clang/lib/Support/RISCVVIntrinsicUtils.cpp index dad3d0dae423a..12e209aec92ce 100644 --- a/clang/lib/Support/RISCVVIntrinsicUtils.cpp +++ b/clang/lib/Support/RISCVVIntrinsicUtils.cpp @@ -980,11 +980,12 @@ RVVIntrinsic::RVVIntrinsic( bool HasMaskedOffOperand, bool HasVL, PolicyScheme Scheme, bool SupportOverloading, bool HasBuiltinAlias, StringRef ManualCodegen, const RVVTypes &OutInTypes, const std::vector &NewIntrinsicTypes, - unsigned NF, Policy NewPolicyAttrs, bool HasFRMRoundModeOp) + unsigned NF, Policy NewPolicyAttrs, bool HasFRMRoundModeOp, unsigned TWiden) : IRName(IRName), IsMasked(IsMasked), HasMaskedOffOperand(HasMaskedOffOperand), HasVL(HasVL), Scheme(Scheme), SupportOverloading(SupportOverloading), HasBuiltinAlias(HasBuiltinAlias), - ManualCodegen(ManualCodegen.str()), NF(NF), PolicyAttrs(NewPolicyAttrs) { + ManualCodegen(ManualCodegen.str()), NF(NF), PolicyAttrs(NewPolicyAttrs), + TWiden(TWiden) { // Init BuiltinName, Name and OverloadedName BuiltinName = NewName.str(); diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp index b0096d8e6b08b..42f52d0ff6241 100644 --- 
a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.cpp @@ -382,7 +382,8 @@ DignosticsEngineWithDiagOpts::DignosticsEngineWithDiagOpts( std::pair, std::unique_ptr> buildCompilation(ArrayRef ArgStrs, DiagnosticsEngine &Diags, - IntrusiveRefCntPtr FS) { + IntrusiveRefCntPtr FS, + llvm::BumpPtrAllocator &Alloc) { SmallVector Argv; Argv.reserve(ArgStrs.size()); for (const std::string &Arg : ArgStrs) @@ -393,7 +394,6 @@ buildCompilation(ArrayRef ArgStrs, DiagnosticsEngine &Diags, "clang LLVM compiler", FS); Driver->setTitle("clang_based_tool"); - llvm::BumpPtrAllocator Alloc; bool CLMode = driver::IsClangCL( driver::getDriverMode(Argv[0], ArrayRef(Argv).slice(1))); @@ -524,13 +524,12 @@ bool initializeScanCompilerInstance( // Use the dependency scanning optimized file system if requested to do so. if (DepFS) { DepFS->resetBypassedPathPrefix(); - if (!ScanInstance.getHeaderSearchOpts().ModuleCachePath.empty()) { - SmallString<256> ModulesCachePath; - normalizeModuleCachePath( - ScanInstance.getFileManager(), - ScanInstance.getHeaderSearchOpts().ModuleCachePath, ModulesCachePath); + SmallString<256> ModulesCachePath; + normalizeModuleCachePath(ScanInstance.getFileManager(), + ScanInstance.getHeaderSearchOpts().ModuleCachePath, + ModulesCachePath); + if (!ModulesCachePath.empty()) DepFS->setBypassedPathPrefix(ModulesCachePath); - } ScanInstance.setDependencyDirectivesGetter( std::make_unique( diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h index 71c6731803597..5657317565e8d 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h +++ b/clang/lib/Tooling/DependencyScanning/DependencyScannerImpl.h @@ -105,7 +105,8 @@ struct TextDiagnosticsPrinterWithOutput { std::pair, std::unique_ptr> buildCompilation(ArrayRef ArgStrs, DiagnosticsEngine &Diags, - IntrusiveRefCntPtr FS); + IntrusiveRefCntPtr 
FS, + llvm::BumpPtrAllocator &Alloc); std::unique_ptr createCompilerInvocation(ArrayRef CommandLine, diff --git a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp index 95154212603ac..0a1cf6b18b11c 100644 --- a/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp +++ b/clang/lib/Tooling/DependencyScanning/DependencyScanningWorker.cpp @@ -78,8 +78,10 @@ static bool forEachDriverJob( IntrusiveRefCntPtr FS, llvm::function_ref Callback) { // Compilation holds a non-owning a reference to the Driver, hence we need to - // keep the Driver alive when we use Compilation. - auto [Driver, Compilation] = buildCompilation(ArgStrs, Diags, FS); + // keep the Driver alive when we use Compilation. Arguments to commands may be + // owned by Alloc when expanded from response files. + llvm::BumpPtrAllocator Alloc; + auto [Driver, Compilation] = buildCompilation(ArgStrs, Diags, FS, Alloc); if (!Compilation) return false; for (const driver::Command &Job : Compilation->getJobs()) { diff --git a/clang/lib/Tooling/Tooling.cpp b/clang/lib/Tooling/Tooling.cpp index ea5a37216e959..e8eef5ed9c9fa 100644 --- a/clang/lib/Tooling/Tooling.cpp +++ b/clang/lib/Tooling/Tooling.cpp @@ -446,6 +446,7 @@ bool FrontendActionFactory::runInvocation( DiagnosticConsumer *DiagConsumer) { // Create a compiler instance to handle the actual work. 
CompilerInstance Compiler(std::move(Invocation), std::move(PCHContainerOps)); + Compiler.setVirtualFileSystem(Files->getVirtualFileSystemPtr()); Compiler.setFileManager(Files); // The FrontendAction can have lifetime requirements for Compiler or its diff --git a/clang/test/AST/ByteCode/arrays.cpp b/clang/test/AST/ByteCode/arrays.cpp index 22a4b41041eb3..eaf9559e40cda 100644 --- a/clang/test/AST/ByteCode/arrays.cpp +++ b/clang/test/AST/ByteCode/arrays.cpp @@ -820,3 +820,14 @@ namespace FAM { return 1; } } + +namespace MultiDimConstructExpr { + struct a { + a *p = this; + }; + struct b { + a m[3][3]; + }; + constexpr b d; + static_assert(d.m[2][1].p == &d.m[2][1]); +} diff --git a/clang/test/AST/ByteCode/bitfields.cpp b/clang/test/AST/ByteCode/bitfields.cpp index df8d5678287d3..a58328286bd9a 100644 --- a/clang/test/AST/ByteCode/bitfields.cpp +++ b/clang/test/AST/ByteCode/bitfields.cpp @@ -128,3 +128,82 @@ namespace NonConstBitWidth { // both-note {{read of non-const variable}} }; } + +namespace IncDecOverflow { + constexpr bool test1() { + struct {unsigned u: 5; } a {}; + a.u--; + return a.u == 31; + } + static_assert(test1(), ""); + + constexpr bool test2() { + struct {unsigned u: 5; } a {}; + --a.u; + return a.u == 31; + } + static_assert(test2(), ""); + + constexpr bool test3() { + int x = 0; + struct {unsigned u: 5; } a {}; + x = a.u--; + return a.u == 31; + } + static_assert(test3(), ""); + + constexpr bool test4() { + int x = 0; + struct {unsigned u: 5; } a {}; + x = --a.u; + return a.u == 31; + } + static_assert(test4(), ""); + + constexpr bool test5() { + struct {unsigned u: 5; } a {}; + a.u = 31; + ++a.u; + + return a.u == 0; + } + static_assert(test5(), ""); + + constexpr bool test6() { + struct {unsigned u: 5; } a {}; + a.u = 31; + ++a.u; + + return a.u == 0; + } + static_assert(test6(), ""); + + constexpr bool test7() { + struct {unsigned u: 5; } a {}; + a.u = 31; + a.u++; + + return a.u == 0; + } + static_assert(test7(), ""); + + constexpr bool test8() { 
+ int x = 0; + struct {unsigned u: 5; } a {}; + a.u = 31; + x = a.u++; + + return a.u == 0; + } + static_assert(test8(), ""); + + constexpr bool test9() { + int x = 0; + struct {unsigned u: 5; } a {}; + a.u = 31; + x = ++a.u; + + return a.u == 0; + } + static_assert(test9(), ""); +} diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index b587cd6eaf89c..1c015da7be54f 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -494,3 +494,32 @@ constexpr int modify_const_variable() { } static_assert(modify_const_variable()); // both-error {{not an integral constant expression}} \ // both-note {{in call to}} + +constexpr int nullDest() { + new (nullptr) int{12}; // both-note {{construction of dereferenced null pointer}} + return 0; +} +static_assert(nullDest() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + +constexpr int nullArrayDest() { + new (nullptr) int{12}; // both-note {{construction of dereferenced null pointer}} + return 0; +} +static_assert(nullArrayDest() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + +constexpr int intDest() { + new ((void*)2) int{3}; // both-note {{cast that performs the conversions of a reinterpret_cast}} + return 0; +} +static_assert(intDest() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + +constexpr int intDestArray() { + new ((void*)2) int[4]; // both-note {{cast that performs the conversions of a reinterpret_cast}} + return 0; +} +static_assert(intDestArray() == 0); // both-error {{not an integral constant expression}} \ + // both-note {{in call to}} + diff --git a/clang/test/AST/ByteCode/records.cpp b/clang/test/AST/ByteCode/records.cpp index 48cf81182c1de..00218ba02bb31 100644 --- a/clang/test/AST/ByteCode/records.cpp +++ b/clang/test/AST/ByteCode/records.cpp @@ -1162,6 +1162,19 @@ namespace 
IndirectFieldInit { static_assert(s2.x == 1 && s2.y == 2 && s2.a == 3 && s2.b == 4); #endif + + + struct B { + struct { + union { + int x = 3; + }; + int y = this->x; + }; + + constexpr B() {} + }; + static_assert(B().y == 3, ""); } namespace InheritedConstructor { @@ -1840,3 +1853,11 @@ namespace DiamondDowncast { constexpr Middle2 &fail = (Middle2&)top1; // both-error {{must be initialized by a constant expression}} \ // both-note {{cannot cast object of dynamic type 'const Bottom' to type 'Middle2'}} } + +namespace PrimitiveInitializedByInitList { + constexpr struct { + int a; + int b{this->a}; + } c{ 17 }; + static_assert(c.b == 17, ""); +} diff --git a/clang/test/AST/HLSL/matrix-constructors.hlsl b/clang/test/AST/HLSL/matrix-constructors.hlsl new file mode 100644 index 0000000000000..0a2f03c7c0fac --- /dev/null +++ b/clang/test/AST/HLSL/matrix-constructors.hlsl @@ -0,0 +1,393 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +typedef float float2x1 __attribute__((matrix_type(2,1))); +typedef float float2x3 __attribute__((matrix_type(2,3))); +typedef float float2x2 __attribute__((matrix_type(2,2))); +typedef float float4x4 __attribute__((matrix_type(4,4))); +typedef float float2 __attribute__((ext_vector_type(2))); +typedef float float4 __attribute__((ext_vector_type(4))); + +[numthreads(1,1,1)] +void ok() { + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} A 'float2x3':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' functional cast to float2x3 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 
+// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 + float2x3 A = float2x3(1,2,3,4,5,6); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} B 'float2x1':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x1':'matrix' functional cast to float2x1 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x1':'matrix' +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 1.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 + float2x1 B = float2x1(1.0,2.0); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} C 'float2x1':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x1':'matrix' functional cast to float2x1 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x1':'matrix' +// CHECK-NEXT: UnaryOperator 0x{{[0-9a-fA-F]+}} 'float' prefix '-' +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 1.000000e+00 +// CHECK-NEXT: UnaryOperator 0x{{[0-9a-fA-F]+}} 'float' prefix '-' +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 + float2x1 C = float2x1(-1.0f,-2.0f); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} D 'float2x3':'matrix' cinit +// CHECK-NEXT: ExprWithCleanups 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' functional cast to float2x3 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 
0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 + float2x3 D = float2x3(float2(1,2), 3, 4, 5, 6); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} E 'float2x3':'matrix' cinit +// CHECK-NEXT: ExprWithCleanups 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' functional cast to float2x3 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 
'float2x3':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 
0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' functional cast to float2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 + float2x3 E = float2x3(float2(1,2), float2(3,4), 5, 6); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} F 'float2x3':'matrix' cinit +// CHECK-NEXT: ExprWithCleanups 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' functional cast to float2x3 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' functional cast to float4 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 
0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' functional cast to float4 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' functional cast to float4 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// 
CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' xvalue +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' functional cast to float4 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float4':'vector' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 + float2x3 F = float2x3(float4(1,2,3,4), 5, 6); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} G 'float2x3':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' functional cast to float2x3 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x3':'matrix' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' matrixcomponent +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 
'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' matrixcomponent +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' matrixcomponent +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: 
IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' matrixcomponent +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 5 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 6 +float2x3 G = float2x3(float2x2(1,2,3,4), 5, 6); + + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} H 'float2x2':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue Var 
0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue Var 0x{{[0-9a-fA-F]+}} 'Vec2' 'float2':'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 + float2 Vec2 = float2(1.0, 2.0); + float2x2 H = float2x2(Vec2,3,4); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} I 'float2x2':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'i' 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'j' 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'k' 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'int' lvalue Var 0x{{[0-9a-fA-F]+}} 'l' 'int' + int i = 1, j = 2, k = 3, l = 4; + float2x2 I = float2x2(i,j,k,l); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} 
col:{{[0-9]+}} J 'float2x2':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .a 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .a 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'struct S' lvalue Var 0x{{[0-9a-fA-F]+}} 's' 'struct S' + struct S { float2 f; float a;} s; + float2x2 J = float2x2(s.f, s.a, s.a); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} L 'second_level_of_typedefs':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 1.000000e+00 +// CHECK-NEXT: 
FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 4.000000e+00 + typedef float2x2 second_level_of_typedefs; + second_level_of_typedefs L = float2x2(1.0f, 2.0f, 3.0f, 4.0f); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} M 'float2x2':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'second_level_of_typedefs':'matrix' functional cast to second_level_of_typedefs +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'second_level_of_typedefs':'matrix' +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 1.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 2.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 3.000000e+00 +// CHECK-NEXT: FloatingLiteral 0x{{[0-9a-fA-F]+}} 'float' 4.000000e+00 + float2x2 M = second_level_of_typedefs(1.0f, 2.0f, 3.0f, 4.0f); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} N 'float4x4':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float4x4':'matrix' functional cast to float4x4 +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'sF' lvalue Var 0x{{[0-9a-fA-F]+}} 'f' 'sF' +struct sF { + float f[16]; +}; + +sF f; +float4x4 N = float4x4(f); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} GettingStrange 'float2x1':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x1':'matrix' functional cast to float2x1 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x1':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .f 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S2' lvalue Var 0x{{[0-9a-fA-F]+}} 's2' 'S2' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue .f 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 
0x{{[0-9a-fA-F]+}} 'S2' lvalue Var 0x{{[0-9a-fA-F]+}} 's2' 'S2' +struct S2 { float f; }; +S2 s2; +float2x1 GettingStrange = float2x1(s2, s2); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} GettingStrange2 'float2x2':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue vectorcomponent +// CHECK-NEXT: MemberExpr 0x{{[0-9a-fA-F]+}} 'float2':'vector' lvalue .f 0x{{[0-9a-fA-F]+}} +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S3' lvalue Var 0x{{[0-9a-fA-F]+}} 's3' 'S3' +// CHECK-NEXT: 
IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +struct S3 { float2 f;}; +S3 s3; +float2x2 GettingStrange2 = float2x2(s3, s3); + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} GettingStrange3 'float2x2':'matrix' cinit +// CHECK-NEXT: CXXFunctionalCastExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' functional cast to float2x2 +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'S4' lvalue Var 0x{{[0-9a-fA-F]+}} 's4' 'S4' +struct S4 { float4 f;}; +S4 s4; +float2x2 GettingStrange3 = float2x2(s4); + +} + diff --git a/clang/test/AST/HLSL/matrix-general-initializer.hlsl b/clang/test/AST/HLSL/matrix-general-initializer.hlsl new file mode 100644 index 0000000000000..14c950acb7baf --- /dev/null +++ b/clang/test/AST/HLSL/matrix-general-initializer.hlsl @@ -0,0 +1,260 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +typedef float float4x2 __attribute__((matrix_type(4,2))); +typedef float float2x2 __attribute__((matrix_type(2,2))); +typedef int int4x4 __attribute__((matrix_type(4,4))); + + +[numthreads(1,1,1)] +void ok() { + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} used m 'float4x2':'matrix' cinit +// CHECK-NEXT: ExprWithCleanups 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// 
CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: 
IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'int' x +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 4 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +float4x2 m = {1.xxx, 2.xx, 3.x, 4.xx}; + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} s 'S' cinit +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'S' +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 
0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 
0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 2 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float4x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm' 'float4x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 3 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +struct S { float2x2 x; float2x2 y;}; +S s = {m}; + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} used m2 'float2x2':'matrix' cinit +// CHECK-NEXT: ExprWithCleanups 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxxx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxxx +// CHECK-NEXT: ImplicitCastExpr 
0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxxx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 2 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ArraySubscriptExpr 0x{{[0-9a-fA-F]+}} 'int' xvalue vectorcomponent +// CHECK-NEXT: MaterializeTemporaryExpr 0x{{[0-9a-fA-F]+}} 'vector' xvalue +// CHECK-NEXT: ExtVectorElementExpr 0x{{[0-9a-fA-F]+}} 'vector' xxxx +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'vector' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} <> '__size_t':'unsigned long' 3 +float2x2 m2 = {0.xxxx}; + +// CHECK: VarDecl 0x{{[0-9a-fA-F]+}} col:{{[0-9]+}} m3 'int4x4':'matrix' cinit +// CHECK-NEXT: InitListExpr 0x{{[0-9a-fA-F]+}} 'int4x4':'matrix' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' 
+// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: 
IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// 
CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: 
IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 0 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'int' +// CHECK-NEXT: ImplicitCastExpr 0x{{[0-9a-fA-F]+}} 'float' +// CHECK-NEXT: MatrixSubscriptExpr 0x{{[0-9a-fA-F]+}} 'float' lvalue matrixcomponent +// CHECK-NEXT: DeclRefExpr 0x{{[0-9a-fA-F]+}} 'float2x2':'matrix' lvalue Var 0x{{[0-9a-fA-F]+}} 'm2' 'float2x2':'matrix' +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +// CHECK-NEXT: IntegerLiteral 0x{{[0-9a-fA-F]+}} 'int' 1 +int4x4 m3 = {m2, m2, m2, m2}; + +} \ No newline at end of file diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp index a4ad741182f56..fd1eecdda64fd 100644 --- a/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp +++ b/clang/test/Analysis/Checkers/WebKit/uncounted-lambda-captures.cpp @@ -107,6 +107,79 @@ class HashMap { ValueType* m_table { nullptr }; }; +class ScopeExit final { +public: + template + explicit ScopeExit(ExitFunctionParameter&& exitFunction) + : m_exitFunction(std::move(exitFunction)) { + } + + ScopeExit(ScopeExit&& other) + : m_exitFunction(std::move(other.m_exitFunction)) + , m_execute(std::move(other.m_execute)) { + } + + ~ScopeExit() { + if (m_execute) + m_exitFunction(); + } + + WTF::Function take() { + m_execute = false; + return std::move(m_exitFunction); + } + + void release() { m_execute = false; } + + ScopeExit(const ScopeExit&) = delete; + ScopeExit& operator=(const ScopeExit&) = delete; + ScopeExit& operator=(ScopeExit&&) = delete; + +private: + WTF::Function m_exitFunction; + bool m_execute { true }; +}; + +template ScopeExit makeScopeExit(ExitFunction&&); +template +ScopeExit makeScopeExit(ExitFunction&& exitFunction) +{ + return ScopeExit(std::move(exitFunction)); +} + +// Visitor adapted from http://stackoverflow.com/questions/25338795/is-there-a-name-for-this-tuple-creation-idiom + +template 
struct Visitor : Visitor, Visitor { + Visitor(A a, B... b) + : Visitor(a) + , Visitor(b...) + { + } + + using Visitor::operator (); + using Visitor::operator (); +}; + +template struct Visitor : A { + Visitor(A a) + : A(a) + { + } + + using A::operator(); +}; + +template Visitor makeVisitor(F... f) +{ + return Visitor(f...); +} + +void opaqueFunction(); +template void visit(Visitor&& v, Variants&&... values) +{ + opaqueFunction(); +} + } // namespace WTF struct A { @@ -501,3 +574,81 @@ void RefCountedObj::call() const }; callLambda(lambda); } + +void scope_exit(RefCountable* obj) { + auto scope = WTF::makeScopeExit([&] { + obj->method(); + }); + someFunction(); + WTF::ScopeExit scope2([&] { + obj->method(); + }); + someFunction(); +} + +void doWhateverWith(WTF::ScopeExit& obj); + +void scope_exit_with_side_effect(RefCountable* obj) { + auto scope = WTF::makeScopeExit([&] { + obj->method(); + // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}} + }); + doWhateverWith(scope); +} + +void scope_exit_static(RefCountable* obj) { + static auto scope = WTF::makeScopeExit([&] { + obj->method(); + // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}} + }); +} + +WTF::Function scope_exit_take_lambda(RefCountable* obj) { + auto scope = WTF::makeScopeExit([&] { + obj->method(); + // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}} + }); + return scope.take(); +} + +// FIXME: Ideally, we treat release() as a trivial function. 
+void scope_exit_release(RefCountable* obj) { + auto scope = WTF::makeScopeExit([&] { + obj->method(); + // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}} + }); + scope.release(); +} + +void make_visitor(RefCountable* obj) { + auto visitor = WTF::makeVisitor([&] { + obj->method(); + }); +} + +void use_visitor(RefCountable* obj) { + auto visitor = WTF::makeVisitor([&] { + obj->method(); + }); + WTF::visit(visitor, obj); +} + +template +void bad_visit(Visitor&, ObjectType*) { + someFunction(); +} + +void static_visitor(RefCountable* obj) { + static auto visitor = WTF::makeVisitor([&] { + obj->method(); + // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}} + }); +} + +void bad_use_visitor(RefCountable* obj) { + auto visitor = WTF::makeVisitor([&] { + obj->method(); + // expected-warning@-1{{Implicitly captured raw-pointer 'obj' to uncounted type is unsafe [webkit.UncountedLambdaCapturesChecker]}} + }); + bad_visit(visitor, obj); +} diff --git a/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c b/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c index 92eae6aab6800..19362cf79b107 100644 --- a/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c +++ b/clang/test/CIR/CodeGen/aapcs-volatile-bitfields.c @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-cir -fdump-record-layouts %s -o %t.cir 1> %t.cirlayout +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-cir -fdump-record-layouts %s -o %t.cir > %t.cirlayout // RUN: FileCheck --input-file=%t.cirlayout %s --check-prefix=CIR-LAYOUT // RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR // RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -fclangir -emit-llvm %s -o %t-cir.ll // RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM -// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm 
-fdump-record-layouts %s -o %t.ll 1> %t.ogcglayout +// RUN: %clang_cc1 -triple aarch64-unknown-linux-gnu -emit-llvm -fdump-record-layouts %s -o %t.ll > %t.ogcglayout // RUN: FileCheck --input-file=%t.ogcglayout %s --check-prefix=OGCG-LAYOUT // RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG diff --git a/clang/test/CIR/CodeGen/builtin_prefetech.c b/clang/test/CIR/CodeGen/builtin_prefetech.c new file mode 100644 index 0000000000000..cfe85b9ba8104 --- /dev/null +++ b/clang/test/CIR/CodeGen/builtin_prefetech.c @@ -0,0 +1,43 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-cir %s -o - | FileCheck %s -check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -fclangir -emit-llvm %s -o - | FileCheck %s -check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o - | FileCheck %s -check-prefix=OGCG + +void foo(void *a) { + __builtin_prefetch(a); // rw=0, locality=3 + __builtin_prefetch(a, 0); // rw=0, locality=3 + __builtin_prefetch(a, 1); // rw=1, locality=3 + __builtin_prefetch(a, 1, 1); // rw=1, locality=1 +} + +// CIR-LABEL: cir.func dso_local @foo( +// CIR: %[[ALLOCA:.*]] = cir.alloca !cir.ptr +// CIR: cir.store %arg0, %[[ALLOCA]] : !cir.ptr, !cir.ptr> +// CIR: %[[P1:.*]] = cir.load{{.*}} %[[ALLOCA]] : !cir.ptr>, !cir.ptr +// CIR: cir.prefetch read locality(3) %[[P1]] : !cir.ptr +// CIR: %[[P2:.*]] = cir.load{{.*}} %[[ALLOCA]] : !cir.ptr>, !cir.ptr +// CIR: cir.prefetch read locality(3) %[[P2]] : !cir.ptr +// CIR: %[[P3:.*]] = cir.load{{.*}} %[[ALLOCA]] : !cir.ptr>, !cir.ptr +// CIR: cir.prefetch write locality(3) %[[P3]] : !cir.ptr +// CIR: %[[P4:.*]] = cir.load{{.*}} %[[ALLOCA]] : !cir.ptr>, !cir.ptr +// CIR: cir.prefetch write locality(1) %[[P4]] : !cir.ptr +// CIR: cir.return + +// LLVM-LABEL: define dso_local void @foo( +// LLVM: [[ALLOCA:%.*]] = alloca ptr, i64 1 +// LLVM: store ptr {{.*}}, ptr [[ALLOCA]] +// LLVM: [[LP1:%.*]] = load ptr, ptr [[ALLOCA]] +// LLVM: call void @llvm.prefetch.p0(ptr 
[[LP1]], i32 0, i32 3, i32 1) +// LLVM: [[LP2:%.*]] = load ptr, ptr [[ALLOCA]] +// LLVM: call void @llvm.prefetch.p0(ptr [[LP2]], i32 0, i32 3, i32 1) +// LLVM: [[LP3:%.*]] = load ptr, ptr [[ALLOCA]] +// LLVM: call void @llvm.prefetch.p0(ptr [[LP3]], i32 1, i32 3, i32 1) +// LLVM: [[LP4:%.*]] = load ptr, ptr [[ALLOCA]] +// LLVM: call void @llvm.prefetch.p0(ptr [[LP4]], i32 1, i32 1, i32 1) +// LLVM: ret void + +// OGCG-LABEL: define dso_local void @foo(ptr +// OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 3, i32 1) +// OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 0, i32 3, i32 1) +// OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 1, i32 3, i32 1) +// OGCG: call void @llvm.prefetch.p0(ptr {{.*}}, i32 1, i32 1, i32 1) +// OGCG: ret void diff --git a/clang/test/CIR/CodeGen/call-via-class-member-funcptr.cpp b/clang/test/CIR/CodeGen/call-via-class-member-funcptr.cpp new file mode 100644 index 0000000000000..dbde45499a40b --- /dev/null +++ b/clang/test/CIR/CodeGen/call-via-class-member-funcptr.cpp @@ -0,0 +1,86 @@ +// RUN: %clang_cc1 -std=c++20 -triple aarch64-none-linux-android21 -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -std=c++20 -triple aarch64-none-linux-android21 -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -std=c++20 -triple aarch64-none-linux-android21 -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + +class A { +public: + static char *b(int); +}; + +int h=0; + +class F { +public: + const char *b(); + A g; +}; + +const char *F::b() { return g.b(h); } + +void fn1() { F f1; } + +// CIR: cir.func {{.*}} @_ZN1F1bEv +// CIR: %[[H_PTR:.*]] = cir.get_global @h : !cir.ptr +// CIR: %[[H_VAL:.*]] = cir.load{{.*}} %[[H_PTR]] : !cir.ptr, !s32i +// CIR: %[[RET:.*]] = cir.call @_ZN1A1bEi(%[[H_VAL]]) : (!s32i) -> !cir.ptr + +// LLVM: define {{.*}} ptr @_ZN1F1bEv +// LLVM: 
%[[VAR_H:.*]] = load i32, ptr @h +// LLVM: %[[RET:.*]] = call ptr @_ZN1A1bEi(i32 %[[VAR_H]]) + +// OGCG: define {{.*}} ptr @_ZN1F1bEv +// OGCG: %[[VAR_H:.*]] = load i32, ptr @h +// OGCG: %[[RET:.*]] = call noundef ptr @_ZN1A1bEi(i32 noundef %[[VAR_H]]) + +class B { +public: + B(); + int (&indirect_callee_int_ref)(int); +}; + +class C { +public: + int call_indirect(int v) { return inner.indirect_callee_int_ref(v); }; + B inner; +}; + +void fn2() { C c1; c1.call_indirect(2); } + +// CIR: cir.func {{.*}} @_ZN1C13call_indirectEi(%[[THIS_ARG:.*]]: !cir.ptr {{.*}}, %[[V_ARG:.*]]: !s32i {{.*}}) -> !s32i +// CIR: %[[THIS_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["this", init] +// CIR: %[[V_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["v", init] +// CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] +// CIR: cir.store %[[V_ARG]], %[[V_ADDR]] +// CIR: %[[THIS:.*]] = cir.load %[[THIS_ADDR]] +// CIR: %[[INNER:.*]] = cir.get_member %[[THIS]][0] {name = "inner"} : !cir.ptr -> !cir.ptr +// CIR: %[[INDIRECT_CALLEE_PTR:.*]] = cir.get_member %[[INNER]][0] {name = "indirect_callee_int_ref"} +// CIR: %[[INDIRECT_CALLEE:.*]] = cir.load %[[INDIRECT_CALLEE_PTR]] +// CIR: %[[V:.*]] = cir.load{{.*}} %[[V_ADDR]] : !cir.ptr, !s32i +// CIR: %[[RET:.*]] = cir.call %[[INDIRECT_CALLEE]](%[[V]]) + +// LLVM: define {{.*}} i32 @_ZN1C13call_indirectEi(ptr %[[THIS_ARG:.*]], i32 %[[V_ARG:.*]]) +// LLVM: %[[THIS_ADDR:.*]] = alloca ptr +// LLVM: %[[V_ADDR:.*]] = alloca i32 +// LLVM: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// LLVM: store i32 %[[V_ARG]], ptr %[[V_ADDR]] +// LLVM: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// LLVM: %[[INNER:.*]] = getelementptr %class.C, ptr %[[THIS]], i32 0, i32 0 +// LLVM: %[[INDIRECT_CALLEE_PTR:.*]] = getelementptr %class.B, ptr %[[INNER]], i32 0, i32 0 +// LLVM: %[[INDIRECT_CALLEE:.*]] = load ptr, ptr %[[INDIRECT_CALLEE_PTR]] +// LLVM: %[[V:.*]] = load i32, ptr %[[V_ADDR]] +// LLVM: %[[RET:.*]] = call i32 %[[INDIRECT_CALLEE]](i32 %[[V]]) + +// OGCG: define {{.*}} i32 
@_ZN1C13call_indirectEi(ptr {{.*}} %[[THIS_ARG:.*]], i32 {{.*}} %[[V_ARG:.*]]) +// OGCG: %[[THIS_ADDR:.*]] = alloca ptr +// OGCG: %[[V_ADDR:.*]] = alloca i32 +// OGCG: store ptr %[[THIS_ARG]], ptr %[[THIS_ADDR]] +// OGCG: store i32 %[[V_ARG]], ptr %[[V_ADDR]] +// OGCG: %[[THIS:.*]] = load ptr, ptr %[[THIS_ADDR]] +// OGCG: %[[INNER:.*]] = getelementptr inbounds nuw %class.C, ptr %[[THIS]], i32 0, i32 0 +// OGCG: %[[INDIRECT_CALLEE_PTR:.*]] = getelementptr inbounds nuw %class.B, ptr %[[INNER]], i32 0, i32 0 +// OGCG: %[[INDIRECT_CALLEE:.*]] = load ptr, ptr %[[INDIRECT_CALLEE_PTR]] +// OGCG: %[[V:.*]] = load i32, ptr %[[V_ADDR]] +// OGCG: %[[RET:.*]] = call noundef i32 %[[INDIRECT_CALLEE]](i32 noundef %[[V]]) diff --git a/clang/test/CIR/CodeGen/complex.cpp b/clang/test/CIR/CodeGen/complex.cpp index 4e89af44dcd18..3fb78dc871904 100644 --- a/clang/test/CIR/CodeGen/complex.cpp +++ b/clang/test/CIR/CodeGen/complex.cpp @@ -1468,3 +1468,30 @@ void calling_function_with_default_arg() { // OGCG: store float 0x40019999A0000000, ptr %[[DEFAULT_ARG_IMAG_PTR]], align 4 // OGCG: %[[TMP_DEFAULT_ARG:.*]] = load <2 x float>, ptr %[[DEFAULT_ARG_ADDR]], align 4 // OGCG: call void @_Z33function_with_complex_default_argCf(<2 x float> {{.*}} %[[TMP_DEFAULT_ARG]]) + +void calling_function_that_return_complex() { + float _Complex a = complex_type_return_type(); +} + +// CIR: %[[A_ADDR:.*]] = cir.alloca !cir.complex, !cir.ptr>, ["a", init] +// CIR: %[[RESULT:.*]] = cir.call @_Z24complex_type_return_typev() : () -> !cir.complex +// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !cir.complex, !cir.ptr> + +// TODO(CIR): the difference between the CIR LLVM and OGCG is because the lack of calling convention lowering, + +// LLVM: %[[A_ADDR:.*]] = alloca { float, float }, i64 1, align 4 +// LLVM: %[[RESULT:.*]] = call { float, float } @_Z24complex_type_return_typev() +// LLVM: store { float, float } %[[RESULT]], ptr %[[A_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca { float, float }, align 4 
+// OGCG: %[[RESULT_ADDR:.*]] = alloca { float, float }, align 4 +// OGCG: %[[RESULT:.*]] = call noundef <2 x float> @_Z24complex_type_return_typev() +// OGCG: store <2 x float> %[[RESULT]], ptr %[[RESULT_ADDR]], align 4 +// OGCG: %[[RESULT_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT_ADDR]], i32 0, i32 0 +// OGCG: %[[RESULT_REAL:.*]] = load float, ptr %[[RESULT_REAL_PTR]], align 4 +// OGCG: %[[RESULT_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[RESULT_ADDR]], i32 0, i32 1 +// OGCG: %[[RESULT_IMAG:.*]] = load float, ptr %[[RESULT_IMAG_PTR]], align 4 +// OGCG: %[[A_REAL_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 0 +// OGCG: %[[A_IMAG_PTR:.*]] = getelementptr inbounds nuw { float, float }, ptr %[[A_ADDR]], i32 0, i32 1 +// OGCG: store float %[[RESULT_REAL]], ptr %[[A_REAL_PTR]], align 4 +// OGCG: store float %[[RESULT_IMAG]], ptr %[[A_IMAG_PTR]], align 4 diff --git a/clang/test/CIR/CodeGen/coro-task.cpp b/clang/test/CIR/CodeGen/coro-task.cpp index 1fc7d77be2bce..265325f82d7f7 100644 --- a/clang/test/CIR/CodeGen/coro-task.cpp +++ b/clang/test/CIR/CodeGen/coro-task.cpp @@ -106,6 +106,9 @@ co_invoke_fn co_invoke; // CIR-NEXT: cir.global external @_ZN5folly4coro9co_invokeE = #cir.zero : !rec_folly3A3Acoro3A3Aco_invoke_fn // CIR: cir.func builtin private @__builtin_coro_id(!u32i, !cir.ptr, !cir.ptr, !cir.ptr) -> !u32i +// CIR: cir.func builtin private @__builtin_coro_alloc(!u32i) -> !cir.bool +// CIR: cir.func builtin private @__builtin_coro_size() -> !u64i +// CIR: cir.func builtin private @__builtin_coro_begin(!u32i, !cir.ptr) -> !cir.ptr using VoidTask = folly::coro::Task; @@ -114,10 +117,24 @@ VoidTask silly_task() { } // CIR: cir.func coroutine dso_local @_Z10silly_taskv() -> ![[VoidTask]] -// CHECK: %[[#VoidTaskAddr:]] = cir.alloca ![[VoidTask]], {{.*}}, ["__retval"] +// CIR: %[[VoidTaskAddr:.*]] = cir.alloca ![[VoidTask]], {{.*}}, ["__retval"] +// CIR: 
%[[SavedFrameAddr:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["__coro_frame_addr"] // Get coroutine id with __builtin_coro_id. // CIR: %[[NullPtr:.*]] = cir.const #cir.ptr : !cir.ptr // CIR: %[[Align:.*]] = cir.const #cir.int<16> : !u32i // CIR: %[[CoroId:.*]] = cir.call @__builtin_coro_id(%[[Align]], %[[NullPtr]], %[[NullPtr]], %[[NullPtr]]) + +// Perform allocation calling operator 'new' depending on __builtin_coro_alloc and +// call __builtin_coro_begin for the final coroutine frame address. + +// CIR: %[[ShouldAlloc:.*]] = cir.call @__builtin_coro_alloc(%[[CoroId]]) : (!u32i) -> !cir.bool +// CIR: cir.store{{.*}} %[[NullPtr]], %[[SavedFrameAddr]] : !cir.ptr, !cir.ptr> +// CIR: cir.if %[[ShouldAlloc]] { +// CIR: %[[CoroSize:.*]] = cir.call @__builtin_coro_size() : () -> !u64i +// CIR: %[[AllocAddr:.*]] = cir.call @_Znwm(%[[CoroSize]]) : (!u64i) -> !cir.ptr +// CIR: cir.store{{.*}} %[[AllocAddr]], %[[SavedFrameAddr]] : !cir.ptr, !cir.ptr> +// CIR: } +// CIR: %[[Load0:.*]] = cir.load{{.*}} %[[SavedFrameAddr]] : !cir.ptr>, !cir.ptr +// CIR: %[[CoroFrameAddr:.*]] = cir.call @__builtin_coro_begin(%[[CoroId]], %[[Load0]]) diff --git a/clang/test/CIR/CodeGen/delete.cpp b/clang/test/CIR/CodeGen/delete.cpp index 69640aa04531f..d8ac4361bb538 100644 --- a/clang/test/CIR/CodeGen/delete.cpp +++ b/clang/test/CIR/CodeGen/delete.cpp @@ -86,3 +86,42 @@ Container::~Container() { delete contents; } // These functions are declared/defined below the calls in OGCG. 
// OGCG: define linkonce_odr void @_ZN8ContentsD2Ev // OGCG: declare void @_ZdlPvm(ptr noundef, i64 noundef) + +struct StructWithVirtualDestructor { + virtual ~StructWithVirtualDestructor(); +}; + +void destroy(StructWithVirtualDestructor *x) { + delete x; +} + +// CIR: cir.func {{.*}} @_Z7destroyP27StructWithVirtualDestructor(%[[X_ARG:.*]]: !cir.ptr {{.*}}) +// CIR: %[[X_ADDR:.*]] = cir.alloca !cir.ptr +// CIR: cir.store %[[X_ARG]], %[[X_ADDR]] +// CIR: %[[X:.*]] = cir.load{{.*}} %[[X_ADDR]] +// CIR: %[[VTABLE_PTR:.*]] = cir.vtable.get_vptr %[[X]] : !cir.ptr -> !cir.ptr +// CIR: %[[VTABLE:.*]] = cir.load{{.*}} %[[VTABLE_PTR]] : !cir.ptr, !cir.vptr +// CIR: %[[DTOR_FN_ADDR_PTR:.*]] = cir.vtable.get_virtual_fn_addr %[[VTABLE]][1] +// CIR: %[[DTOR_FN_ADDR:.*]] = cir.load{{.*}} %[[DTOR_FN_ADDR_PTR]] +// CIR: cir.call %[[DTOR_FN_ADDR]](%[[X]]) + +// LLVM: define {{.*}} void @_Z7destroyP27StructWithVirtualDestructor(ptr %[[X_ARG:.*]]) +// LLVM: %[[X_ADDR:.*]] = alloca ptr +// LLVM: store ptr %[[X_ARG]], ptr %[[X_ADDR]] +// LLVM: %[[X:.*]] = load ptr, ptr %[[X_ADDR]] +// LLVM: %[[VTABLE:.*]] = load ptr, ptr %[[X]] +// LLVM: %[[DTOR_FN_ADDR_PTR:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i32 1 +// LLVM: %[[DTOR_FN_ADDR:.*]] = load ptr, ptr %[[DTOR_FN_ADDR_PTR]] +// LLVM: call void %[[DTOR_FN_ADDR]](ptr %[[X]]) + +// OGCG: define {{.*}} void @_Z7destroyP27StructWithVirtualDestructor(ptr {{.*}} %[[X_ARG:.*]]) +// OGCG: %[[X_ADDR:.*]] = alloca ptr +// OGCG: store ptr %[[X_ARG]], ptr %[[X_ADDR]] +// OGCG: %[[X:.*]] = load ptr, ptr %[[X_ADDR]] +// OGCG: %[[ISNULL:.*]] = icmp eq ptr %[[X]], null +// OGCG: br i1 %[[ISNULL]], label %{{.*}}, label %[[DELETE_NOTNULL:.*]] +// OGCG: [[DELETE_NOTNULL]]: +// OGCG: %[[VTABLE:.*]] = load ptr, ptr %[[X]] +// OGCG: %[[DTOR_FN_ADDR_PTR:.*]] = getelementptr inbounds ptr, ptr %[[VTABLE]], i64 1 +// OGCG: %[[DTOR_FN_ADDR:.*]] = load ptr, ptr %[[DTOR_FN_ADDR_PTR]] +// OGCG: call void %[[DTOR_FN_ADDR]](ptr {{.*}} %[[X]]) diff --git 
a/clang/test/CIR/CodeGen/dtors.cpp b/clang/test/CIR/CodeGen/dtors.cpp index f2c80a547f1d3..1fe048b7d5327 100644 --- a/clang/test/CIR/CodeGen/dtors.cpp +++ b/clang/test/CIR/CodeGen/dtors.cpp @@ -35,7 +35,7 @@ bool make_temp(const B &) { return false; } bool test_temp_or() { return make_temp(1) || make_temp(2); } // CIR: cir.func{{.*}} @_Z12test_temp_orv() -// CIR: %[[SCOPE:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[REF_TMP0:.*]] = cir.alloca !rec_B, !cir.ptr, ["ref.tmp0"] // CIR: %[[ONE:.*]] = cir.const #cir.int<1> // CIR: cir.call @_ZN1BC2Ei(%[[REF_TMP0]], %[[ONE]]) @@ -51,9 +51,9 @@ bool test_temp_or() { return make_temp(1) || make_temp(2); } // CIR: cir.call @_ZN1BD2Ev(%[[REF_TMP1]]) // CIR: cir.yield %[[MAKE_TEMP1]] : !cir.bool // CIR: }) +// CIR: cir.store{{.*}} %[[TERNARY]], %[[RETVAL:.*]] // CIR: cir.call @_ZN1BD2Ev(%[[REF_TMP0]]) -// CIR: cir.yield %[[TERNARY]] : !cir.bool -// CIR: } : !cir.bool +// CIR: } // LLVM: define{{.*}} i1 @_Z12test_temp_orv(){{.*}} { // LLVM: %[[REF_TMP0:.*]] = alloca %struct.B @@ -105,7 +105,7 @@ bool test_temp_or() { return make_temp(1) || make_temp(2); } bool test_temp_and() { return make_temp(1) && make_temp(2); } // CIR: cir.func{{.*}} @_Z13test_temp_andv() -// CIR: %[[SCOPE:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[REF_TMP0:.*]] = cir.alloca !rec_B, !cir.ptr, ["ref.tmp0"] // CIR: %[[ONE:.*]] = cir.const #cir.int<1> // CIR: cir.call @_ZN1BC2Ei(%[[REF_TMP0]], %[[ONE]]) @@ -121,9 +121,9 @@ bool test_temp_and() { return make_temp(1) && make_temp(2); } // CIR: %[[FALSE:.*]] = cir.const #false // CIR: cir.yield %[[FALSE]] : !cir.bool // CIR: }) +// CIR: cir.store{{.*}} %[[TERNARY]], %[[RETVAL:.*]] // CIR: cir.call @_ZN1BD2Ev(%[[REF_TMP0]]) -// CIR: cir.yield %[[TERNARY]] : !cir.bool -// CIR: } : !cir.bool +// CIR: } // LLVM: define{{.*}} i1 @_Z13test_temp_andv(){{.*}} { // LLVM: %[[REF_TMP0:.*]] = alloca %struct.B diff --git a/clang/test/CIR/CodeGen/dynamic-cast-exact.cpp 
b/clang/test/CIR/CodeGen/dynamic-cast-exact.cpp new file mode 100644 index 0000000000000..9ddb68fbf7355 --- /dev/null +++ b/clang/test/CIR/CodeGen/dynamic-cast-exact.cpp @@ -0,0 +1,208 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O1 -fclangir -clangir-disable-passes -emit-cir -o %t.cir %s +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O1 -fclangir -emit-llvm -o %t-cir.ll %s +// RUN: FileCheck --input-file=%t-cir.ll --check-prefix=LLVM %s +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++20 -O1 -emit-llvm -o %t.ll %s +// RUN: FileCheck --input-file=%t.ll --check-prefix=OGCG %s + +struct Base1 { + virtual ~Base1(); +}; + +struct Base2 { + virtual ~Base2(); +}; + +struct Derived final : Base1 {}; + +Derived *ptr_cast(Base1 *ptr) { + return dynamic_cast(ptr); +} + +// CIR: cir.func {{.*}} @_Z8ptr_castP5Base1 +// CIR: %[[SRC:.*]] = cir.load{{.*}} %{{.+}} : !cir.ptr>, !cir.ptr +// CIR-NEXT: %[[NULL_PTR:.*]] = cir.const #cir.ptr +// CIR-NEXT: %[[SRC_IS_NULL:.*]] = cir.cmp(eq, %[[SRC]], %[[NULL_PTR]]) +// CIR-NEXT: %[[RESULT:.*]] = cir.ternary(%[[SRC_IS_NULL]], true { +// CIR-NEXT: %[[NULL_PTR_DEST:.*]] = cir.const #cir.ptr : !cir.ptr +// CIR-NEXT: cir.yield %[[NULL_PTR_DEST]] : !cir.ptr +// CIR-NEXT: }, false { +// CIR-NEXT: %[[EXPECTED_VPTR:.*]] = cir.vtable.address_point(@_ZTV7Derived, address_point = ) : !cir.vptr +// CIR-NEXT: %[[SRC_VPTR_PTR:.*]] = cir.cast bitcast %[[SRC]] : !cir.ptr -> !cir.ptr +// CIR-NEXT: %[[SRC_VPTR:.*]] = cir.load{{.*}} %[[SRC_VPTR_PTR]] : !cir.ptr, !cir.vptr +// CIR-NEXT: %[[SUCCESS:.*]] = cir.cmp(eq, %[[SRC_VPTR]], %[[EXPECTED_VPTR]]) : !cir.vptr, !cir.bool +// CIR-NEXT: %[[EXACT_RESULT:.*]] = cir.ternary(%[[SUCCESS]], true { +// CIR-NEXT: %[[RES:.*]] = cir.cast bitcast %[[SRC]] : !cir.ptr -> !cir.ptr +// CIR-NEXT: cir.yield %[[RES]] : !cir.ptr +// CIR-NEXT: }, false { +// CIR-NEXT: %[[NULL:.*]] = cir.const #cir.ptr : !cir.ptr +// 
CIR-NEXT: cir.yield %[[NULL]] : !cir.ptr +// CIR-NEXT: }) : (!cir.bool) -> !cir.ptr +// CIR-NEXT: cir.yield %[[EXACT_RESULT]] : !cir.ptr +// CIR-NEXT: }) : (!cir.bool) -> !cir.ptr + +// Note: The LLVM output omits the label for the entry block (which is +// implicitly %1), so we use %{{.*}} to match the implicit label in the +// phi check. + +// LLVM: define dso_local ptr @_Z8ptr_castP5Base1(ptr{{.*}} %[[SRC:.*]]) +// LLVM-NEXT: %[[SRC_IS_NULL:.*]] = icmp eq ptr %0, null +// LLVM-NEXT: br i1 %[[SRC_IS_NULL]], label %[[LABEL_END:.*]], label %[[LABEL_NOTNULL:.*]] +// LLVM: [[LABEL_NOTNULL]]: +// LLVM-NEXT: %[[VPTR:.*]] = load ptr, ptr %[[SRC]], align 8 +// LLVM-NEXT: %[[SUCCESS:.*]] = icmp eq ptr %[[VPTR]], getelementptr inbounds nuw (i8, ptr @_ZTV7Derived, i64 16) +// LLVM-NEXT: %[[EXACT_RESULT:.*]] = select i1 %[[SUCCESS]], ptr %[[SRC]], ptr null +// LLVM-NEXT: br label %[[LABEL_END]] +// LLVM: [[LABEL_END]]: +// LLVM-NEXT: %[[RESULT:.*]] = phi ptr [ %[[EXACT_RESULT]], %[[LABEL_NOTNULL]] ], [ null, %{{.*}} ] +// LLVM-NEXT: ret ptr %[[RESULT]] +// LLVM-NEXT: } + +// OGCG: define{{.*}} ptr @_Z8ptr_castP5Base1(ptr {{.*}} %[[SRC:.*]]) +// OGCG-NEXT: entry: +// OGCG-NEXT: %[[NULL_CHECK:.*]] = icmp eq ptr %[[SRC]], null +// OGCG-NEXT: br i1 %[[NULL_CHECK]], label %[[LABEL_NULL:.*]], label %[[LABEL_NOTNULL:.*]] +// OGCG: [[LABEL_NOTNULL]]: +// OGCG-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[SRC]], align 8 +// OGCG-NEXT: %[[VTABLE_CHECK:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds {{.*}} (i8, ptr @_ZTV7Derived, i64 16) +// OGCG-NEXT: br i1 %[[VTABLE_CHECK]], label %[[LABEL_END:.*]], label %[[LABEL_NULL]] +// OGCG: [[LABEL_NULL]]: +// OGCG-NEXT: br label %[[LABEL_END]] +// OGCG: [[LABEL_END]]: +// OGCG-NEXT: %[[RESULT:.*]] = phi ptr [ %[[SRC]], %[[LABEL_NOTNULL]] ], [ null, %[[LABEL_NULL]] ] +// OGCG-NEXT: ret ptr %[[RESULT]] +// OGCG-NEXT: } + +Derived &ref_cast(Base1 &ref) { + return dynamic_cast(ref); +} + +// CIR: cir.func {{.*}} @_Z8ref_castR5Base1 +// CIR: 
%[[SRC:.*]] = cir.load{{.*}} %{{.+}} : !cir.ptr>, !cir.ptr +// CIR-NEXT: %[[EXPECTED_VPTR:.*]] = cir.vtable.address_point(@_ZTV7Derived, address_point = ) : !cir.vptr +// CIR-NEXT: %[[SRC_VPTR_PTR:.*]] = cir.cast bitcast %[[SRC]] : !cir.ptr -> !cir.ptr +// CIR-NEXT: %[[SRC_VPTR:.*]] = cir.load{{.*}} %[[SRC_VPTR_PTR]] : !cir.ptr, !cir.vptr +// CIR-NEXT: %[[SUCCESS:.*]] = cir.cmp(eq, %[[SRC_VPTR]], %[[EXPECTED_VPTR]]) : !cir.vptr, !cir.bool +// CIR-NEXT: %[[FAILED:.*]] = cir.unary(not, %[[SUCCESS]]) : !cir.bool, !cir.bool +// CIR-NEXT: cir.if %[[FAILED]] { +// CIR-NEXT: cir.call @__cxa_bad_cast() : () -> () +// CIR-NEXT: cir.unreachable +// CIR-NEXT: } +// CIR-NEXT: %{{.+}} = cir.cast bitcast %[[SRC]] : !cir.ptr -> !cir.ptr + +// LLVM: define{{.*}} ptr @_Z8ref_castR5Base1(ptr{{.*}} %[[SRC:.*]]) +// LLVM-NEXT: %[[VPTR:.*]] = load ptr, ptr %[[SRC]], align 8 +// LLVM-NEXT: %[[OK:.*]] = icmp eq ptr %[[VPTR]], getelementptr inbounds nuw (i8, ptr @_ZTV7Derived, i64 16) +// LLVM-NEXT: br i1 %[[OK]], label %[[LABEL_OK:.*]], label %[[LABEL_FAIL:.*]] +// LLVM: [[LABEL_FAIL]]: +// LLVM-NEXT: tail call void @__cxa_bad_cast() +// LLVM-NEXT: unreachable +// LLVM: [[LABEL_OK]]: +// LLVM-NEXT: ret ptr %[[SRC]] +// LLVM-NEXT: } + +// OGCG: define{{.*}} ptr @_Z8ref_castR5Base1(ptr {{.*}} %[[REF:.*]]) +// OGCG-NEXT: entry: +// OGCG-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[REF]], align 8 +// OGCG-NEXT: %[[VTABLE_CHECK:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds {{.*}} (i8, ptr @_ZTV7Derived, i64 16) +// OGCG-NEXT: br i1 %[[VTABLE_CHECK]], label %[[LABEL_END:.*]], label %[[LABEL_NULL:.*]] +// OGCG: [[LABEL_NULL]]: +// OGCG-NEXT: {{.*}}call void @__cxa_bad_cast() +// OGCG-NEXT: unreachable +// OGCG: [[LABEL_END]]: +// OGCG-NEXT: ret ptr %[[REF]] +// OGCG-NEXT: } + +struct Offset { virtual ~Offset(); }; +struct A { virtual ~A(); }; +struct B final : Offset, A { }; + +B *offset_cast(A *a) { + return dynamic_cast(a); +} + +// CIR: cir.func {{.*}} @_Z11offset_castP1A +// CIR: 
%[[SRC:.*]] = cir.load{{.*}} %{{.+}} : !cir.ptr>, !cir.ptr +// CIR-NEXT: %[[NULL_PTR:.*]] = cir.const #cir.ptr +// CIR-NEXT: %[[SRC_IS_NULL:.*]] = cir.cmp(eq, %[[SRC]], %[[NULL_PTR]]) +// CIR-NEXT: %[[RESULT:.*]] = cir.ternary(%[[SRC_IS_NULL]], true { +// CIR-NEXT: %[[NULL_PTR_DEST:.*]] = cir.const #cir.ptr : !cir.ptr +// CIR-NEXT: cir.yield %[[NULL_PTR_DEST]] : !cir.ptr +// CIR-NEXT: }, false { +// CIR-NEXT: %[[EXPECTED_VPTR:.*]] = cir.vtable.address_point(@_ZTV1B, address_point = ) : !cir.vptr +// CIR-NEXT: %[[SRC_VPTR_PTR:.*]] = cir.cast bitcast %[[SRC]] : !cir.ptr -> !cir.ptr +// CIR-NEXT: %[[SRC_VPTR:.*]] = cir.load{{.*}} %[[SRC_VPTR_PTR]] : !cir.ptr, !cir.vptr +// CIR-NEXT: %[[SUCCESS:.*]] = cir.cmp(eq, %[[SRC_VPTR]], %[[EXPECTED_VPTR]]) : !cir.vptr, !cir.bool +// CIR-NEXT: %[[EXACT_RESULT:.*]] = cir.ternary(%[[SUCCESS]], true { +// CIR-NEXT: %[[MINUS_EIGHT:.*]] = cir.const #cir.int<18446744073709551608> : !u64i +// CIR-NEXT: %[[SRC_VOID:.*]] = cir.cast bitcast %[[SRC]] : !cir.ptr -> !cir.ptr +// CIR-NEXT: %[[SRC_OFFSET:.*]] = cir.ptr_stride %[[SRC_VOID]], %[[MINUS_EIGHT]] +// CIR-NEXT: %[[RES:.*]] = cir.cast bitcast %[[SRC_OFFSET]] : !cir.ptr -> !cir.ptr +// CIR-NEXT: cir.yield %[[RES]] : !cir.ptr +// CIR-NEXT: }, false { +// CIR-NEXT: %[[NULL:.*]] = cir.const #cir.ptr : !cir.ptr +// CIR-NEXT: cir.yield %[[NULL]] : !cir.ptr +// CIR-NEXT: }) : (!cir.bool) -> !cir.ptr +// CIR-NEXT: cir.yield %[[EXACT_RESULT]] : !cir.ptr +// CIR-NEXT: }) : (!cir.bool) -> !cir.ptr + +// LLVM: define dso_local ptr @_Z11offset_castP1A(ptr{{.*}} %[[SRC:.*]]) +// LLVM-NEXT: %[[SRC_IS_NULL:.*]] = icmp eq ptr %0, null +// LLVM-NEXT: br i1 %[[SRC_IS_NULL]], label %[[LABEL_END:.*]], label %[[LABEL_NOTNULL:.*]] +// LLVM: [[LABEL_NOTNULL]]: +// LLVM-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[SRC]] +// LLVM-NEXT: %[[VTABLE_CHECK:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds nuw (i8, ptr @_ZTV1B, i64 48) +// LLVM-NEXT: %[[SRC_OFFSET:.*]] = getelementptr i8, ptr %[[SRC]], i64 -8 +// 
LLVM-NEXT: %[[EXACT_RESULT:.*]] = select i1 %[[VTABLE_CHECK]], ptr %[[SRC_OFFSET]], ptr null +// LLVM-NEXT: br label %[[LABEL_END]] +// LLVM: [[LABEL_END]]: +// LLVM-NEXT: %[[RESULT:.*]] = phi ptr [ %[[EXACT_RESULT]], %[[LABEL_NOTNULL]] ], [ null, %{{.*}} ] +// LLVM-NEXT: ret ptr %[[RESULT]] +// LLVM-NEXT: } + +// OGCG: define{{.*}} ptr @_Z11offset_castP1A(ptr{{.*}} %[[SRC:.*]]) +// OGCG: %[[SRV_NULL:.*]] = icmp eq ptr %[[SRC]], null +// OGCG-NEXT: br i1 %[[SRV_NULL]], label %[[LABEL_NULL:.*]], label %[[LABEL_NOTNULL:.*]] +// OGCG: [[LABEL_NOTNULL]]: +// OGCG-NEXT: %[[VTABLE:.*]] = load ptr, ptr %[[SRC]] +// OGCG-NEXT: %[[VTABLE_CHECK:.*]] = icmp eq ptr %[[VTABLE]], getelementptr inbounds nuw inrange(-16, 16) (i8, ptr @_ZTV1B, i64 48) +// OGCG-NEXT: %[[RESULT:.*]] = getelementptr inbounds i8, ptr %[[SRC]], i64 -8 +// OGCG-NEXT: br i1 %[[VTABLE_CHECK]], label %[[LABEL_END:.*]], label %[[LABEL_NULL]] +// OGCG: [[LABEL_NULL]]: +// OGCG-NEXT: br label %[[LABEL_END]] +// OGCG: [[LABEL_END]]: +// OGCG-NEXT: phi ptr [ %[[RESULT]], %[[LABEL_NOTNULL]] ], [ null, %[[LABEL_NULL]] ] + +Derived *ptr_cast_always_fail(Base2 *ptr) { + return dynamic_cast(ptr); + } + +// CIR: cir.func {{.*}} @_Z20ptr_cast_always_failP5Base2 +// CIR: %{{.+}} = cir.load{{.*}} %{{.+}} : !cir.ptr>, !cir.ptr +// CIR-NEXT: %[[RESULT:.*]] = cir.const #cir.ptr : !cir.ptr +// CIR-NEXT: cir.store %[[RESULT]], %{{.*}} : !cir.ptr, !cir.ptr> + +// LLVM: define {{.*}} ptr @_Z20ptr_cast_always_failP5Base2 +// LLVM-NEXT: ret ptr null + +// OGCG: define {{.*}} ptr @_Z20ptr_cast_always_failP5Base2 +// OGCG-NEXT: entry: +// OGCG-NEXT: ret ptr null + +Derived &ref_cast_always_fail(Base2 &ref) { + return dynamic_cast(ref); +} + +// CIR: cir.func {{.*}} @_Z20ref_cast_always_failR5Base2 +// CIR: %{{.+}} = cir.load{{.*}} %{{.+}} : !cir.ptr>, !cir.ptr +// CIR-NEXT: cir.call @__cxa_bad_cast() : () -> () +// CIR-NEXT: cir.unreachable + +// LLVM: define {{.*}} ptr @_Z20ref_cast_always_failR5Base2 +// LLVM-NEXT: tail call void 
@__cxa_bad_cast() +// LLVM-NEXT: unreachable + +// OGCG: define {{.*}} ptr @_Z20ref_cast_always_failR5Base2 +// OGCG-NEXT: entry: +// OGCG-NEXT: tail call void @__cxa_bad_cast() +// OGCG-NEXT: unreachable diff --git a/clang/test/CIR/CodeGen/global-init.cpp b/clang/test/CIR/CodeGen/global-init.cpp index 0aab69536241a..01e2868278514 100644 --- a/clang/test/CIR/CodeGen/global-init.cpp +++ b/clang/test/CIR/CodeGen/global-init.cpp @@ -103,19 +103,91 @@ NeedsCtorDtor needsCtorDtor; // OGCG: call void @_ZN13NeedsCtorDtorC1Ev(ptr noundef nonnull align 1 dereferenceable(1) @needsCtorDtor) // OGCG: %{{.*}} = call i32 @__cxa_atexit(ptr @_ZN13NeedsCtorDtorD1Ev, ptr @needsCtorDtor, ptr @__dso_handle) +float num; +float _Complex a = {num, num}; + +// CIR-BEFORE-LPP: cir.global external @num = #cir.fp<0.000000e+00> : !cir.float +// CIR-BEFORE-LPP: cir.global external @a = ctor : !cir.complex { +// CIR-BEFORE-LPP: %[[THIS:.*]] = cir.get_global @a : !cir.ptr> +// CIR-BEFORE-LPP: %[[NUM:.*]] = cir.get_global @num : !cir.ptr +// CIR-BEFORE-LPP: %[[REAL:.*]] = cir.load{{.*}} %[[NUM]] : !cir.ptr, !cir.float +// CIR-BEFORE-LPP: %[[NUM:.*]] = cir.get_global @num : !cir.ptr +// CIR-BEFORE-LPP: %[[IMAG:.*]] = cir.load{{.*}} %[[NUM]] : !cir.ptr, !cir.float +// CIR-BEFORE-LPP: %[[COMPLEX_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR-BEFORE-LPP: cir.store{{.*}} %[[COMPLEX_VAL:.*]], %[[THIS]] : !cir.complex, !cir.ptr> +// CIR-BEFORE-LPP: } + +// CIR: cir.global external @num = #cir.fp<0.000000e+00> : !cir.float +// CIR: cir.global external @a = #cir.zero : !cir.complex +// CIR: cir.func internal private @__cxx_global_var_init.3() +// CIR: %[[A_ADDR:.*]] = cir.get_global @a : !cir.ptr> +// CIR: %[[NUM:.*]] = cir.get_global @num : !cir.ptr +// CIR: %[[REAL:.*]] = cir.load{{.*}} %[[NUM]] : !cir.ptr, !cir.float +// CIR: %[[NUM:.*]] = cir.get_global @num : !cir.ptr +// CIR: %[[IMAG:.*]] = cir.load{{.*}} %[[NUM]] : !cir.ptr, !cir.float +// CIR: 
%[[COMPLEX_VAL:.*]] = cir.complex.create %[[REAL]], %[[IMAG]] : !cir.float -> !cir.complex +// CIR: cir.store{{.*}} %[[COMPLEX_VAL]], %[[A_ADDR]] : !cir.complex, !cir.ptr> + +// LLVM: define internal void @__cxx_global_var_init.3() +// LLVM: %[[REAL:.*]] = load float, ptr @num, align 4 +// LLVM: %[[IMAG:.*]] = load float, ptr @num, align 4 +// LLVM: %[[TMP_COMPLEX_VAL:.*]] = insertvalue { float, float } {{.*}}, float %[[REAL]], 0 +// LLVM: %[[COMPLEX_VAL:.*]] = insertvalue { float, float } %[[TMP_COMPLEX_VAL]], float %[[IMAG]], 1 +// LLVM: store { float, float } %[[COMPLEX_VAL]], ptr @a, align 4 + +// OGCG: define internal void @__cxx_global_var_init.3() {{.*}} section ".text.startup" +// OGCG: %[[REAL:.*]] = load float, ptr @num, align 4 +// OGCG: %[[IMAG:.*]] = load float, ptr @num, align 4 +// OGCG: store float %[[REAL]], ptr @a, align 4 +// OGCG: store float %[[IMAG]], ptr getelementptr inbounds nuw ({ float, float }, ptr @a, i32 0, i32 1), align 4 + +float fp; +int i = (int)fp; + +// CIR-BEFORE-LPP: cir.global external @i = ctor : !s32i { +// CIR-BEFORE-LPP: %0 = cir.get_global @i : !cir.ptr +// CIR-BEFORE-LPP: %1 = cir.get_global @fp : !cir.ptr +// CIR-BEFORE-LPP: %2 = cir.load{{.*}} %1 : !cir.ptr, !cir.float +// CIR-BEFORE-LPP: %3 = cir.cast float_to_int %2 : !cir.float -> !s32i +// CIR-BEFORE-LPP: cir.store{{.*}} %3, %0 : !s32i, !cir.ptr +// CIR-BEFORE-LPP: } + +// CIR: cir.func internal private @__cxx_global_var_init.4() +// CIR: %[[I_ADDR:.*]] = cir.get_global @i : !cir.ptr +// CIR: %[[FP_ADDR:.*]] = cir.get_global @fp : !cir.ptr +// CIR: %[[TMP_FP:.*]] = cir.load{{.*}} %[[FP_ADDR]] : !cir.ptr, !cir.float +// CIR: %[[FP_I32:.*]] = cir.cast float_to_int %[[TMP_FP]] : !cir.float -> !s32i +// CIR: cir.store{{.*}} %[[FP_I32]], %[[I_ADDR]] : !s32i, !cir.ptr + +// LLVM: define internal void @__cxx_global_var_init.4() +// LLVM: %[[TMP_FP:.*]] = load float, ptr @fp, align 4 +// LLVM: %[[FP_I32:.*]] = fptosi float %[[TMP_FP]] to i32 +// LLVM: store i32 
%[[FP_I32]], ptr @i, align 4 + +// OGCG: define internal void @__cxx_global_var_init.4() {{.*}} section ".text.startup" +// OGCG: %[[TMP_FP:.*]] = load float, ptr @fp, align 4 +// OGCG: %[[FP_I32:.*]] = fptosi float %[[TMP_FP]] to i32 +// OGCG: store i32 %[[FP_I32]], ptr @i, align 4 + // Common init function for all globals with default priority // CIR: cir.func private @_GLOBAL__sub_I_[[FILENAME:.*]]() { // CIR: cir.call @__cxx_global_var_init() : () -> () // CIR: cir.call @__cxx_global_var_init.1() : () -> () // CIR: cir.call @__cxx_global_var_init.2() : () -> () +// CIR: cir.call @__cxx_global_var_init.3() : () -> () +// CIR: cir.call @__cxx_global_var_init.4() : () -> () // LLVM: define void @_GLOBAL__sub_I_[[FILENAME]]() // LLVM: call void @__cxx_global_var_init() // LLVM: call void @__cxx_global_var_init.1() // LLVM: call void @__cxx_global_var_init.2() +// LLVM: call void @__cxx_global_var_init.3() +// LLVM: call void @__cxx_global_var_init.4() // OGCG: define internal void @_GLOBAL__sub_I_[[FILENAME]]() {{.*}} section ".text.startup" { // OGCG: call void @__cxx_global_var_init() // OGCG: call void @__cxx_global_var_init.1() // OGCG: call void @__cxx_global_var_init.2() +// OGCG: call void @__cxx_global_var_init.3() +// OGCG: call void @__cxx_global_var_init.4() diff --git a/clang/test/CIR/CodeGen/lambda.cpp b/clang/test/CIR/CodeGen/lambda.cpp index 0c32ceb187df4..91380b9bea296 100644 --- a/clang/test/CIR/CodeGen/lambda.cpp +++ b/clang/test/CIR/CodeGen/lambda.cpp @@ -219,14 +219,13 @@ int f() { // CIR: cir.func dso_local @_Z1fv() -> !s32i {{.*}} { // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] -// CIR: %[[SCOPE_RET:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[TMP:.*]] = cir.alloca ![[REC_LAM_G2]], !cir.ptr, ["ref.tmp0"] // CIR: %[[G2:.*]] = cir.call @_Z2g2v() : () -> ![[REC_LAM_G2]] // CIR: cir.store{{.*}} %[[G2]], %[[TMP]] // CIR: %[[RESULT:.*]] = cir.call @_ZZ2g2vENK3$_0clEv(%[[TMP]]) -// CIR: cir.yield %[[RESULT]] +// CIR: 
cir.store{{.*}} %[[RESULT]], %[[RETVAL]] // CIR: } -// CIR: cir.store{{.*}} %[[SCOPE_RET]], %[[RETVAL]] // CIR: %[[RET:.*]] = cir.load{{.*}} %[[RETVAL]] // CIR: cir.return %[[RET]] @@ -255,10 +254,9 @@ int f() { // LLVM: %[[G2:.*]] = call %[[REC_LAM_G2]] @_Z2g2v() // LLVM: store %[[REC_LAM_G2]] %[[G2]], ptr %[[TMP]] // LLVM: %[[RESULT:.*]] = call i32 @"_ZZ2g2vENK3$_0clEv"(ptr %[[TMP]]) +// LLVM: store i32 %[[RESULT]], ptr %[[RETVAL]] // LLVM: br label %[[RET_BB:.*]] // LLVM: [[RET_BB]]: -// LLVM: %[[RETPHI:.*]] = phi i32 [ %[[RESULT]], %[[SCOPE_BB]] ] -// LLVM: store i32 %[[RETPHI]], ptr %[[RETVAL]] // LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL]] // LLVM: ret i32 %[[RET]] @@ -333,14 +331,13 @@ struct A { // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] // CIR: %[[THIS]] = cir.load deref %[[THIS_ADDR]] : !cir.ptr>, !cir.ptr -// CIR: %[[SCOPE_RET:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[LAM_ADDR:.*]] = cir.alloca ![[REC_LAM_A]], !cir.ptr, ["ref.tmp0"] // CIR: %[[STRUCT_A:.*]] = cir.get_member %[[LAM_ADDR]][0] {name = "this"} : !cir.ptr -> !cir.ptr // CIR: cir.call @_ZN1AC1ERKS_(%[[STRUCT_A]], %[[THIS]]){{.*}} : (!cir.ptr, !cir.ptr){{.*}} -> () // CIR: %[[LAM_RET:.*]] = cir.call @_ZZN1A3fooEvENKUlvE_clEv(%[[LAM_ADDR]]) -// CIR: cir.yield %[[LAM_RET]] +// CIR: cir.store{{.*}} %[[LAM_RET]], %[[RETVAL]] // CIR: } -// CIR: cir.store{{.*}} %[[SCOPE_RET]], %[[RETVAL]] // CIR: %[[RET:.*]] = cir.load{{.*}} %[[RETVAL]] // CIR: cir.return %[[RET]] @@ -355,10 +352,9 @@ struct A { // LLVM: %[[STRUCT_A:.*]] = getelementptr %[[REC_LAM_A]], ptr %[[LAM_ALLOCA]], i32 0, i32 0 // LLVM: call void @_ZN1AC1ERKS_(ptr %[[STRUCT_A]], ptr %[[THIS]]) // LLVM: %[[LAM_RET:.*]] = call i32 @_ZZN1A3fooEvENKUlvE_clEv(ptr %[[LAM_ALLOCA]]) +// LLVM: store i32 %[[LAM_RET]], ptr %[[RETVAL]] // LLVM: br label %[[RET_BB:.*]] // LLVM: [[RET_BB]]: -// LLVM: %[[RETPHI:.*]] = phi i32 [ %[[LAM_RET]], %[[SCOPE_BB]] ] -// LLVM: store i32 
%[[RETPHI]], ptr %[[RETVAL]] // LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL]] // LLVM: ret i32 %[[RET]] @@ -407,14 +403,13 @@ struct A { // CIR: %[[RETVAL:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] // CIR: cir.store %[[THIS_ARG]], %[[THIS_ADDR]] // CIR: %[[THIS]] = cir.load %[[THIS_ADDR]] : !cir.ptr>, !cir.ptr -// CIR: %[[SCOPE_RET:.*]] = cir.scope { +// CIR: cir.scope { // CIR: %[[LAM_ADDR:.*]] = cir.alloca ![[REC_LAM_PTR_A]], !cir.ptr, ["ref.tmp0"] // CIR: %[[A_ADDR_ADDR:.*]] = cir.get_member %[[LAM_ADDR]][0] {name = "this"} : !cir.ptr -> !cir.ptr> // CIR: cir.store{{.*}} %[[THIS]], %[[A_ADDR_ADDR]] // CIR: %[[LAM_RET:.*]] = cir.call @_ZZN1A3barEvENKUlvE_clEv(%[[LAM_ADDR]]) -// CIR: cir.yield %[[LAM_RET]] +// CIR: cir.store{{.*}} %[[LAM_RET]], %[[RETVAL]] // CIR: } -// CIR: cir.store{{.*}} %[[SCOPE_RET]], %[[RETVAL]] // CIR: %[[RET:.*]] = cir.load{{.*}} %[[RETVAL]] // CIR: cir.return %[[RET]] @@ -429,10 +424,9 @@ struct A { // LLVM: %[[A_ADDR_ADDR:.*]] = getelementptr %[[REC_LAM_PTR_A]], ptr %[[LAM_ALLOCA]], i32 0, i32 0 // LLVM: store ptr %[[THIS]], ptr %[[A_ADDR_ADDR]] // LLVM: %[[LAM_RET:.*]] = call i32 @_ZZN1A3barEvENKUlvE_clEv(ptr %[[LAM_ALLOCA]]) +// LLVM: store i32 %[[LAM_RET]], ptr %[[RETVAL]] // LLVM: br label %[[RET_BB:.*]] // LLVM: [[RET_BB]]: -// LLVM: %[[RETPHI:.*]] = phi i32 [ %[[LAM_RET]], %[[SCOPE_BB]] ] -// LLVM: store i32 %[[RETPHI]], ptr %[[RETVAL]] // LLVM: %[[RET:.*]] = load i32, ptr %[[RETVAL]] // LLVM: ret i32 %[[RET]] diff --git a/clang/test/CIR/CodeGen/new.cpp b/clang/test/CIR/CodeGen/new.cpp index 000ea5b77bebf..2efad10c19efd 100644 --- a/clang/test/CIR/CodeGen/new.cpp +++ b/clang/test/CIR/CodeGen/new.cpp @@ -208,6 +208,127 @@ void t_new_constant_size() { // OGCG: %[[CALL:.*]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 128) // OGCG: store ptr %[[CALL]], ptr %[[P_ADDR]], align 8 +class C { + public: + ~C(); +}; + +void t_constant_size_nontrivial() { + auto p = new C[3]; +} + +// CHECK: cir.func{{.*}} 
@_Z26t_constant_size_nontrivialv() +// CHECK: %[[P_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] {alignment = 8 : i64} +// CHECK: %[[#NUM_ELEMENTS:]] = cir.const #cir.int<3> : !u64i +// CHECK: %[[#SIZE_WITHOUT_COOKIE:]] = cir.const #cir.int<3> : !u64i +// CHECK: %[[#ALLOCATION_SIZE:]] = cir.const #cir.int<11> : !u64i +// CHECK: %[[RAW_PTR:.*]] = cir.call @_Znam(%[[#ALLOCATION_SIZE]]) : (!u64i) -> !cir.ptr +// CHECK: %[[COOKIE_PTR_BASE:.*]] = cir.cast bitcast %[[RAW_PTR]] : !cir.ptr -> !cir.ptr> +// CHECK: %[[COOKIE_PTR:.*]] = cir.cast bitcast %[[COOKIE_PTR_BASE]] : !cir.ptr> -> !cir.ptr +// CHECK: cir.store align(8) %[[#NUM_ELEMENTS]], %[[COOKIE_PTR]] : !u64i, !cir.ptr +// CHECK: %[[#COOKIE_SIZE:]] = cir.const #cir.int<8> : !s32i +// CHECK: %[[DATA_PTR_RAW:.*]] = cir.ptr_stride %[[COOKIE_PTR_BASE]], %[[#COOKIE_SIZE]] : (!cir.ptr>, !s32i) -> !cir.ptr> +// CHECK: %[[DATA_PTR_VOID:.*]] = cir.cast bitcast %[[DATA_PTR_RAW]] : !cir.ptr> -> !cir.ptr +// CHECK: %[[DATA_PTR:.*]] = cir.cast bitcast %[[DATA_PTR_VOID]] : !cir.ptr -> !cir.ptr +// CHECK: cir.store align(8) %[[DATA_PTR]], %[[P_ADDR]] : !cir.ptr, !cir.ptr> +// CHECK: cir.return +// CHECK: } + +// LLVM: @_Z26t_constant_size_nontrivialv() +// LLVM: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[COOKIE_PTR:.*]] = call ptr @_Znam(i64 11) +// LLVM: store i64 3, ptr %[[COOKIE_PTR]], align 8 +// LLVM: %[[ALLOCATED_PTR:.*]] = getelementptr ptr, ptr %[[COOKIE_PTR]], i64 8 +// LLVM: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 + +// OGCG: @_Z26t_constant_size_nontrivialv() +// OGCG: %[[ALLOCA:.*]] = alloca ptr, align 8 +// OGCG: %[[COOKIE_PTR:.*]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 11) +// OGCG: store i64 3, ptr %[[COOKIE_PTR]], align 8 +// OGCG: %[[ALLOCATED_PTR:.*]] = getelementptr inbounds i8, ptr %[[COOKIE_PTR]], i64 8 +// OGCG: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 + +class D { + public: + int x; + ~D(); +}; + +void t_constant_size_nontrivial2() { + auto 
p = new D[3]; +} + +// In this test SIZE_WITHOUT_COOKIE isn't used, but it would be if there were +// an initializer. + +// CHECK: cir.func{{.*}} @_Z27t_constant_size_nontrivial2v() +// CHECK: %[[P_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] {alignment = 8 : i64} +// CHECK: %[[#NUM_ELEMENTS:]] = cir.const #cir.int<3> : !u64i +// CHECK: %[[#SIZE_WITHOUT_COOKIE:]] = cir.const #cir.int<12> : !u64i +// CHECK: %[[#ALLOCATION_SIZE:]] = cir.const #cir.int<20> : !u64i +// CHECK: %[[RAW_PTR:.*]] = cir.call @_Znam(%[[#ALLOCATION_SIZE]]) : (!u64i) -> !cir.ptr +// CHECK: %[[COOKIE_PTR_BASE:.*]] = cir.cast bitcast %[[RAW_PTR]] : !cir.ptr -> !cir.ptr> +// CHECK: %[[COOKIE_PTR:.*]] = cir.cast bitcast %[[COOKIE_PTR_BASE]] : !cir.ptr> -> !cir.ptr +// CHECK: cir.store align(8) %[[#NUM_ELEMENTS]], %[[COOKIE_PTR]] : !u64i, !cir.ptr +// CHECK: %[[#COOKIE_SIZE:]] = cir.const #cir.int<8> : !s32i +// CHECK: %[[DATA_PTR_RAW:.*]] = cir.ptr_stride %[[COOKIE_PTR_BASE]], %[[#COOKIE_SIZE]] : (!cir.ptr>, !s32i) -> !cir.ptr> +// CHECK: %[[DATA_PTR_VOID:.*]] = cir.cast bitcast %[[DATA_PTR_RAW]] : !cir.ptr> -> !cir.ptr +// CHECK: %[[DATA_PTR:.*]] = cir.cast bitcast %[[DATA_PTR_VOID]] : !cir.ptr -> !cir.ptr +// CHECK: cir.store align(8) %[[DATA_PTR]], %[[P_ADDR]] : !cir.ptr, !cir.ptr> +// CHECK: cir.return +// CHECK: } + +// LLVM: @_Z27t_constant_size_nontrivial2v() +// LLVM: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[COOKIE_PTR:.*]] = call ptr @_Znam(i64 20) +// LLVM: store i64 3, ptr %[[COOKIE_PTR]], align 8 +// LLVM: %[[ALLOCATED_PTR:.*]] = getelementptr ptr, ptr %[[COOKIE_PTR]], i64 8 +// LLVM: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 + +struct alignas(16) E { + int x; + ~E(); +}; + +void t_align16_nontrivial() { + auto p = new E[2]; +} + +// CHECK: cir.func{{.*}} @_Z20t_align16_nontrivialv() +// CHECK: %[[P_ADDR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["p", init] {alignment = 8 : i64} +// CHECK: %[[#NUM_ELEMENTS:]] = cir.const #cir.int<2> : !u64i +// 
CHECK: %[[#SIZE_WITHOUT_COOKIE:]] = cir.const #cir.int<32> : !u64i +// CHECK: %[[#ALLOCATION_SIZE:]] = cir.const #cir.int<48> : !u64i +// CHECK: %[[RAW_PTR:.*]] = cir.call @_Znam(%[[#ALLOCATION_SIZE]]) : (!u64i) -> !cir.ptr +// CHECK: %[[COOKIE_PTR_BASE:.*]] = cir.cast bitcast %[[RAW_PTR]] : !cir.ptr -> !cir.ptr> +// CHECK: %[[COOKIE_OFFSET:.*]] = cir.const #cir.int<8> : !s32i +// CHECK: %[[COOKIE_PTR_RAW:.*]] = cir.ptr_stride %[[COOKIE_PTR_BASE]], %[[COOKIE_OFFSET]] : (!cir.ptr>, !s32i) -> !cir.ptr> +// CHECK: %[[COOKIE_PTR:.*]] = cir.cast bitcast %[[COOKIE_PTR_RAW]] : !cir.ptr> -> !cir.ptr +// CHECK: cir.store align(8) %[[#NUM_ELEMENTS]], %[[COOKIE_PTR]] : !u64i, !cir.ptr +// CHECK: %[[#COOKIE_SIZE:]] = cir.const #cir.int<16> : !s32i +// CHECK: %[[DATA_PTR_RAW:.*]] = cir.ptr_stride %[[COOKIE_PTR_BASE]], %[[#COOKIE_SIZE]] : (!cir.ptr>, !s32i) -> !cir.ptr> +// CHECK: %[[DATA_PTR_VOID:.*]] = cir.cast bitcast %[[DATA_PTR_RAW]] : !cir.ptr> -> !cir.ptr +// CHECK: %[[DATA_PTR:.*]] = cir.cast bitcast %[[DATA_PTR_VOID]] : !cir.ptr -> !cir.ptr +// CHECK: cir.store align(8) %[[DATA_PTR]], %[[P_ADDR]] : !cir.ptr, !cir.ptr> +// CHECK: cir.return +// CHECK: } + +// LLVM: @_Z20t_align16_nontrivialv() +// LLVM: %[[ALLOCA:.*]] = alloca ptr, i64 1, align 8 +// LLVM: %[[RAW_PTR:.*]] = call ptr @_Znam(i64 48) +// LLVM: %[[COOKIE_PTR:.*]] = getelementptr ptr, ptr %[[RAW_PTR]], i64 8 +// LLVM: store i64 2, ptr %[[COOKIE_PTR]], align 8 +// LLVM: %[[ALLOCATED_PTR:.*]] = getelementptr ptr, ptr %[[RAW_PTR]], i64 16 +// LLVM: store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 + +// OGCG: define{{.*}} void @_Z20t_align16_nontrivialv +// OGCG: %[[ALLOCA:.*]] = alloca ptr, align 8 +// OGCG: %[[RAW_PTR:.*]] = call noalias noundef nonnull ptr @_Znam(i64 noundef 48) +// OGCG: %[[COOKIE_PTR:.*]] = getelementptr inbounds i8, ptr %[[RAW_PTR]], i64 8 +// OGCG: store i64 2, ptr %[[COOKIE_PTR]], align 8 +// OGCG: %[[ALLOCATED_PTR:.*]] = getelementptr inbounds i8, ptr %[[RAW_PTR]], i64 16 +// OGCG: 
store ptr %[[ALLOCATED_PTR]], ptr %[[ALLOCA]], align 8 +// OGCG: ret void void t_new_multidim_constant_size() { auto p = new double[2][3][4]; diff --git a/clang/test/CIR/CodeGen/statement-exprs.c b/clang/test/CIR/CodeGen/statement-exprs.c index f6ec9ecd1b67e..c784ec9eda7d8 100644 --- a/clang/test/CIR/CodeGen/statement-exprs.c +++ b/clang/test/CIR/CodeGen/statement-exprs.c @@ -218,7 +218,7 @@ struct S { int x; }; int test3() { return ({ struct S s = {1}; s; }).x; } // CIR: cir.func no_proto dso_local @test3() -> !s32i // CIR: %[[RETVAL:.+]] = cir.alloca !s32i, !cir.ptr, ["__retval"] -// CIR: %[[YIELDVAL:.+]] = cir.scope { +// CIR: cir.scope { // CIR: %[[REF_TMP0:.+]] = cir.alloca !rec_S, !cir.ptr, ["ref.tmp0"] // CIR: %[[TMP:.+]] = cir.alloca !rec_S, !cir.ptr, ["tmp"] // CIR: cir.scope { @@ -230,9 +230,8 @@ int test3() { return ({ struct S s = {1}; s; }).x; } // CIR: } // CIR: %[[GEP_X_TMP:.+]] = cir.get_member %[[REF_TMP0]][0] {name = "x"} : !cir.ptr -> !cir.ptr // CIR: %[[XVAL:.+]] = cir.load {{.*}} %[[GEP_X_TMP]] : !cir.ptr, !s32i -// CIR: cir.yield %[[XVAL]] : !s32i -// CIR: } : !s32i -// CIR: cir.store %[[YIELDVAL]], %[[RETVAL]] : !s32i, !cir.ptr +// CIR: cir.store{{.*}} %[[XVAL]], %[[RETVAL]] : !s32i, !cir.ptr +// CIR: } // CIR: %[[RES:.+]] = cir.load %[[RETVAL]] : !cir.ptr, !s32i // CIR: cir.return %[[RES]] : !s32i @@ -252,10 +251,9 @@ int test3() { return ({ struct S s = {1}; s; }).x; } // LLVM: [[LBL8]]: // LLVM: %[[GEP_VAR1:.+]] = getelementptr %struct.S, ptr %[[VAR1]], i32 0, i32 0 // LLVM: %[[LOAD_X:.+]] = load i32, ptr %[[GEP_VAR1]] +// LLVM: store i32 %[[LOAD_X]], ptr %[[VAR4]] // LLVM: br label %[[LBL11:.+]] // LLVM: [[LBL11]]: -// LLVM: %[[PHI:.+]] = phi i32 [ %[[LOAD_X]], %[[LBL8]] ] -// LLVM: store i32 %[[PHI]], ptr %[[VAR4]] // LLVM: %[[RES:.+]] = load i32, ptr %[[VAR4]] // LLVM: ret i32 %[[RES]] diff --git a/clang/test/CIR/CodeGen/struct-init.cpp b/clang/test/CIR/CodeGen/struct-init.cpp index a47ef530b8369..79886190616b9 100644 --- 
a/clang/test/CIR/CodeGen/struct-init.cpp +++ b/clang/test/CIR/CodeGen/struct-init.cpp @@ -5,10 +5,58 @@ // RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -emit-llvm %s -o %t.ll // RUN: FileCheck --check-prefix=OGCG --input-file=%t.ll %s +struct BitfieldStruct { + unsigned int a:4; + unsigned int b:14; + unsigned int c:14; +}; + +BitfieldStruct overlapping_init = { 3, 2, 1 }; + +// This is unintuitive. The bitfields are initialized using a struct of constants +// that maps to the bitfields but splits the value into bytes. + +// CIR: cir.global external @overlapping_init = #cir.const_record<{#cir.int<35> : !u8i, #cir.int<0> : !u8i, #cir.int<4> : !u8i, #cir.int<0> : !u8i}> : !rec_anon_struct +// LLVM: @overlapping_init = global { i8, i8, i8, i8 } { i8 35, i8 0, i8 4, i8 0 } +// OGCG: @overlapping_init = global { i8, i8, i8, i8 } { i8 35, i8 0, i8 4, i8 0 } + struct S { int a, b, c; }; +S partial_init = { 1 }; + +// CIR: cir.global external @partial_init = #cir.const_record<{#cir.int<1> : !s32i, #cir.int<0> : !s32i, #cir.int<0> : !s32i}> : !rec_S +// LLVM: @partial_init = global %struct.S { i32 1, i32 0, i32 0 } +// OGCG: @partial_init = global %struct.S { i32 1, i32 0, i32 0 } + +struct StructWithDefaultInit { + int a = 2; +}; + +StructWithDefaultInit swdi = {}; + +// CIR: cir.global external @swdi = #cir.const_record<{#cir.int<2> : !s32i}> : !rec_StructWithDefaultInit +// LLVM: @swdi = global %struct.StructWithDefaultInit { i32 2 }, align 4 +// OGCG: @swdi = global %struct.StructWithDefaultInit { i32 2 }, align 4 + +struct StructWithFieldInitFromConst { + int a : 10; + int b = a; +}; + +StructWithFieldInitFromConst swfifc = {}; + +// CIR: cir.global external @swfifc = #cir.zero : !rec_anon_struct +// LLVM: @swfifc = global { i8, i8, i32 } zeroinitializer, align 4 +// OGCG: @swfifc = global { i8, i8, i32 } zeroinitializer, align 4 + +StructWithFieldInitFromConst swfifc2 = { 2 }; + +// CIR: cir.global external @swfifc2 = #cir.const_record<{#cir.int<2> : !u8i, 
#cir.int<0> : !u8i, #cir.int<2> : !s32i}> : !rec_anon_struct +// LLVM: @swfifc2 = global { i8, i8, i32 } { i8 2, i8 0, i32 2 }, align 4 +// OGCG: @swfifc2 = global { i8, i8, i32 } { i8 2, i8 0, i32 2 }, align 4 + void init() { S s1 = {1, 2, 3}; S s2 = {4, 5}; diff --git a/clang/test/CIR/CodeGen/struct.cpp b/clang/test/CIR/CodeGen/struct.cpp index 6d362c79c1c44..c8db71498e477 100644 --- a/clang/test/CIR/CodeGen/struct.cpp +++ b/clang/test/CIR/CodeGen/struct.cpp @@ -280,3 +280,67 @@ void bin_comma() { // OGCG: define{{.*}} void @_Z9bin_commav() // OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4 // OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false) + +void compound_literal_expr() { CompleteS a = (CompleteS){}; } + +// CIR: %[[A_ADDR:.*]] = cir.alloca !rec_CompleteS, !cir.ptr, ["a", init] +// CIR: %[[A_ELEM_0_PTR:.*]] = cir.get_member %[[A_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.store{{.*}} %[[CONST_0]], %[[A_ELEM_0_PTR]] : !s32i, !cir.ptr +// CIR: %[[A_ELEM_1_PTR:.*]] = cir.get_member %[[A_ADDR]][1] {name = "b"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s8i +// CIR: cir.store{{.*}} %[[CONST_0]], %[[A_ELEM_1_PTR]] : !s8i, !cir.ptr + +// TODO(cir): zero-initialize the padding + +// LLVM: %[[A_ADDR:.*]] = alloca %struct.CompleteS, i64 1, align 4 +// LLVM: %[[A_ELEM_0_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 0 +// LLVM: store i32 0, ptr %[[A_ELEM_0_PTR]], align 4 +// LLVM: %[[A_ELEM_1_PTR:.*]] = getelementptr %struct.CompleteS, ptr %[[A_ADDR]], i32 0, i32 1 +// LLVM: store i8 0, ptr %[[A_ELEM_1_PTR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca %struct.CompleteS, align 4 +// OGCG: call void @llvm.memset.p0.i64(ptr align 4 %[[A_ADDR]], i8 0, i64 8, i1 false) + +struct StructWithConstMember { + int a : 1; +}; + +void struct_with_const_member_expr() { + int a = (StructWithConstMember){}.a; +} + 
+// CIR: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a", init] +// CIR: %[[RESULT:.*]] = cir.scope { +// CIR: %[[REF_ADDR:.*]] = cir.alloca !rec_StructWithConstMember, !cir.ptr, ["ref.tmp0"] +// CIR: %[[ELEM_0_PTR:.*]] = cir.get_member %[[REF_ADDR]][0] {name = "a"} : !cir.ptr -> !cir.ptr +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: %[[SET_BF:.*]] = cir.set_bitfield{{.*}} (#bfi_a, %[[ELEM_0_PTR]] : !cir.ptr, %[[CONST_0]] : !s32i) -> !s32i +// CIR: %[[CONST_0:.*]] = cir.const #cir.int<0> : !s32i +// CIR: cir.yield %[[CONST_0]] : !s32i +// CIR: } : !s32i +// CIR: cir.store{{.*}} %[[RESULT]], %[[A_ADDR]] : !s32i, !cir.ptr + +// TODO(cir): zero-initialize the padding + +// LLVM: %[[REF_ADDR:.*]] = alloca %struct.StructWithConstMember, i64 1, align 4 +// LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: br label %[[BF_LABEL:.*]] +// LLVM: [[BF_LABEL]]: +// LLVM: %[[ELEM_0_PTR:.*]] = getelementptr %struct.StructWithConstMember, ptr %[[REF_ADDR]], i32 0, i32 0 +// LLVM: %[[TMP_REF:.*]] = load i8, ptr %[[ELEM_0_PTR]], align 4 +// LLVM: %[[BF_CLEAR:.*]] = and i8 %[[TMP_REF]], -2 +// LLVM: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0 +// LLVM: store i8 %[[BF_SET]], ptr %[[ELEM_0_PTR]], align 4 +// LLVM: br label %[[RESULT_LABEL:.*]] +// LLVM: [[RESULT_LABEL]]: +// LLVM: %[[RESULT:.*]] = phi i32 [ 0, %[[BF_LABEL]] ] +// LLVM: store i32 %[[RESULT]], ptr %[[A_ADDR]], align 4 + +// OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[REF_ADDR:.*]] = alloca %struct.StructWithConstMember, align 4 +// OGCG: %[[TMP_REF:.*]] = load i8, ptr %[[REF_ADDR]], align 4 +// OGCG: %[[BF_CLEAR:.*]] = and i8 %[[TMP_REF]], -2 +// OGCG: %[[BF_SET:.*]] = or i8 %[[BF_CLEAR]], 0 +// OGCG: store i8 %[[BF_SET]], ptr %[[REF_ADDR]], align 4 +// OGCG: store i32 0, ptr %[[A_ADDR]], align 4 diff --git a/clang/test/CIR/CodeGen/trivial-ctor-const-init.cpp b/clang/test/CIR/CodeGen/trivial-ctor-const-init.cpp new file mode 100644 index 0000000000000..7429549100362 --- /dev/null 
+++ b/clang/test/CIR/CodeGen/trivial-ctor-const-init.cpp @@ -0,0 +1,44 @@ +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++11 -fclangir -emit-cir %s -o %t.cir +// RUN: FileCheck --input-file=%t.cir %s --check-prefix=CIR +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++11 -fclangir -emit-llvm %s -o %t-cir.ll +// RUN: FileCheck --input-file=%t-cir.ll %s --check-prefix=LLVM +// RUN: %clang_cc1 -triple x86_64-unknown-linux-gnu -std=c++11 -emit-llvm %s -o %t.ll +// RUN: FileCheck --input-file=%t.ll %s --check-prefix=OGCG + +struct StructWithDefaultCtor { + int n; +}; + +StructWithDefaultCtor defCtor = StructWithDefaultCtor(); + +// CIR: cir.global {{.*}} @defCtor = #cir.zero : !rec_StructWithDefaultCtor +// LLVM: @defCtor = global %struct.StructWithDefaultCtor zeroinitializer +// OGCG: @defCtor = global %struct.StructWithDefaultCtor zeroinitializer + +struct StructWithCtorArg { + double value; + StructWithCtorArg(const double& x) : value(x) {} +}; + +StructWithCtorArg withArg = 0.0; + +// CIR: cir.global {{.*}} @withArg = #cir.zero : !rec_StructWithCtorArg +// LLVM: @withArg = global %struct.StructWithCtorArg zeroinitializer +// OGCG: @withArg = global %struct.StructWithCtorArg zeroinitializer + +// CIR: cir.func {{.*}} @__cxx_global_var_init() +// CIR: %[[TMP0:.*]] = cir.alloca !cir.double, !cir.ptr, ["ref.tmp0"] +// CIR: %[[WITH_ARG:.*]] = cir.get_global @withArg : !cir.ptr +// CIR: %[[ZERO:.*]] = cir.const #cir.fp<0.000000e+00> : !cir.double +// CIR: cir.store{{.*}} %[[ZERO]], %[[TMP0]] : !cir.double, !cir.ptr +// CIR: cir.call @_ZN17StructWithCtorArgC1ERKd(%[[WITH_ARG]], %[[TMP0]]) : (!cir.ptr, !cir.ptr) -> () + +// LLVM: define {{.*}} void @__cxx_global_var_init() +// LLVM: %[[TMP0:.*]] = alloca double +// LLVM: store double 0.000000e+00, ptr %[[TMP0]] +// LLVM: call void @_ZN17StructWithCtorArgC1ERKd(ptr @withArg, ptr %[[TMP0]]) + +// OGCG: define {{.*}} void @__cxx_global_var_init() +// OGCG: %[[TMP0:.*]] = alloca double +// OGCG: store 
double 0.000000e+00, ptr %[[TMP0]] +// OGCG: call void @_ZN17StructWithCtorArgC1ERKd(ptr {{.*}} @withArg, ptr {{.*}} %[[TMP0]]) diff --git a/clang/test/CIR/CodeGen/try-catch.cpp b/clang/test/CIR/CodeGen/try-catch.cpp index 5a503102d13df..1e4d2a63ada01 100644 --- a/clang/test/CIR/CodeGen/try-catch.cpp +++ b/clang/test/CIR/CodeGen/try-catch.cpp @@ -117,3 +117,50 @@ void try_catch_with_empty_catch_all_2() { // OGCG: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], 1 // OGCG: store i32 %[[RESULT]], ptr %[[A_ADDR]], align 4 // OGCG: ret void + +void try_catch_with_alloca() { + try { + int a; + int b; + int c = a + b; + } catch (...) { + } +} + +// CIR: cir.scope { +// CIR: %[[A_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["a"] +// CIR: %[[B_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["b"] +// CIR: %[[C_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["c", init] +// CIR: cir.try { +// CIR: %[[TMP_A:.*]] = cir.load{{.*}} %[[A_ADDR]] : !cir.ptr, !s32i +// CIR: %[[TMP_B:.*]] = cir.load{{.*}} %[[B_ADDR]] : !cir.ptr, !s32i +// CIR: %[[RESULT:.*]] = cir.binop(add, %[[TMP_A]], %[[TMP_B]]) nsw : !s32i +// CIR: cir.store{{.*}} %[[RESULT]], %[[C_ADDR]] : !s32i, !cir.ptr +// CIR: cir.yield +// CIR: } +// CIR: } + +// LLVM: %[[A_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[B_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: %[[C_ADDR:.*]] = alloca i32, i64 1, align 4 +// LLVM: br label %[[LABEL_1:.*]] +// LLVM: [[LABEL_1]]: +// LLVM: br label %[[LABEL_2:.*]] +// LLVM: [[LABEL_2]]: +// LLVM: %[[TMP_A:.*]] = load i32, ptr %[[A_ADDR]], align 4 +// LLVM: %[[TMP_B:.*]] = load i32, ptr %[[B_ADDR]], align 4 +// LLVM: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], %[[TMP_B]] +// LLVM: store i32 %[[RESULT]], ptr %[[C_ADDR]], align 4 +// LLVM: br label %[[LABEL_3:.*]] +// LLVM: [[LABEL_3]]: +// LLVM: br label %[[LABEL_4:.*]] +// LLVM: [[LABEL_4]]: +// LLVM: ret void + +// OGCG: %[[A_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[B_ADDR:.*]] = alloca i32, align 4 +// OGCG: %[[C_ADDR:.*]] = alloca i32, align 4 +// 
OGCG: %[[TMP_A:.*]] = load i32, ptr %[[A_ADDR]], align 4 +// OGCG: %[[TMP_B:.*]] = load i32, ptr %[[B_ADDR]], align 4 +// OGCG: %[[RESULT:.*]] = add nsw i32 %[[TMP_A]], %[[TMP_B]] +// OGCG: store i32 %[[RESULT]], ptr %[[C_ADDR]], align 4 diff --git a/clang/test/CIR/CodeGen/vla.c b/clang/test/CIR/CodeGen/vla.c index b22c704f67198..0af4f838b6287 100644 --- a/clang/test/CIR/CodeGen/vla.c +++ b/clang/test/CIR/CodeGen/vla.c @@ -282,4 +282,61 @@ void f3(unsigned len) { // break; // } // } - \ No newline at end of file + +int f5(unsigned long len) { + int arr[len]; + return arr[2]; +} + +// CIR: cir.func{{.*}} @f5(%[[LEN_ARG:.*]]: !u64i {{.*}}) -> !s32i +// CIR: %[[LEN_ADDR:.*]] = cir.alloca !u64i, !cir.ptr, ["len", init] +// CIR: %[[RET_ADDR:.*]] = cir.alloca !s32i, !cir.ptr, ["__retval"] +// CIR: %[[SAVED_STACK:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["saved_stack"] +// CIR: cir.store{{.*}} %[[LEN_ARG]], %[[LEN_ADDR]] +// CIR: %[[LEN:.*]] = cir.load{{.*}} %[[LEN_ADDR]] +// CIR: %[[STACK_PTR:.*]] = cir.stacksave +// CIR: cir.store{{.*}} %[[STACK_PTR]], %[[SAVED_STACK]] +// CIR: %[[ARR:.*]] = cir.alloca !s32i, !cir.ptr, %[[LEN]] : !u64i, ["arr"] +// CIR: %[[TWO:.*]] = cir.const #cir.int<2> : !s32i +// CIR: %[[ARR_2:.*]] = cir.ptr_stride %[[ARR]], %[[TWO]] +// CIR: %[[ARR_VAL:.*]] = cir.load{{.*}} %[[ARR_2]] : !cir.ptr, !s32i +// CIR: cir.store{{.*}} %[[ARR_VAL]], %[[RET_ADDR]] : !s32i, !cir.ptr +// CIR: %[[STACK_RESTORE_PTR:.*]] = cir.load{{.*}} %[[SAVED_STACK]] +// CIR: cir.stackrestore %[[STACK_RESTORE_PTR]] +// CIR: %[[RET_VAL:.*]] = cir.load{{.*}} %[[RET_ADDR]] +// CIR: cir.return %[[RET_VAL]] : !s32i + +// LLVM: define{{.*}} i32 @f5(i64 %[[LEN_ARG:.*]]) +// LLVM: %[[LEN_ADDR:.*]] = alloca i64 +// LLVM: %[[RET_ADDR:.*]] = alloca i32 +// LLVM: %[[SAVED_STACK:.*]] = alloca ptr +// LLVM: store i64 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// LLVM: %[[LEN:.*]] = load i64, ptr %[[LEN_ADDR]] +// LLVM: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// LLVM: store ptr 
%[[STACK_PTR]], ptr %[[SAVED_STACK]] +// LLVM: %[[ARR:.*]] = alloca i32, i64 %[[LEN]] +// LLVM: %[[ARR_2:.*]] = getelementptr i32, ptr %[[ARR]], i64 2 +// LLVM: %[[ARR_VAL:.*]] = load i32, ptr %[[ARR_2]] +// LLVM: store i32 %[[ARR_VAL]], ptr %[[RET_ADDR]] +// LLVM: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// LLVM: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) +// LLVM: %[[RET_VAL:.*]] = load i32, ptr %[[RET_ADDR]] +// LLVM: ret i32 %[[RET_VAL]] + +// Note: VLA_EXPR0 below is emitted to capture debug info. + +// OGCG: define{{.*}} i32 @f5(i64 {{.*}} %[[LEN_ARG:.*]]) +// OGCG: %[[LEN_ADDR:.*]] = alloca i64 +// OGCG: %[[SAVED_STACK:.*]] = alloca ptr +// OGCG: %[[VLA_EXPR0:.*]] = alloca i64 +// OGCG: store i64 %[[LEN_ARG]], ptr %[[LEN_ADDR]] +// OGCG: %[[LEN:.*]] = load i64, ptr %[[LEN_ADDR]] +// OGCG: %[[STACK_PTR:.*]] = call ptr @llvm.stacksave.p0() +// OGCG: store ptr %[[STACK_PTR]], ptr %[[SAVED_STACK]] +// OGCG: %[[ARR:.*]] = alloca i32, i64 %[[LEN]] +// OGCG: store i64 %[[LEN]], ptr %[[VLA_EXPR0]] +// OGCG: %[[ARR_2:.*]] = getelementptr inbounds i32, ptr %[[ARR]], i64 2 +// OGCG: %[[ARR_VAL:.*]] = load i32, ptr %[[ARR_2]] +// OGCG: %[[STACK_RESTORE_PTR:.*]] = load ptr, ptr %[[SAVED_STACK]] +// OGCG: call void @llvm.stackrestore.p0(ptr %[[STACK_RESTORE_PTR]]) +// OGCG: ret i32 %[[ARR_VAL]] diff --git a/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp b/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp new file mode 100644 index 0000000000000..9882f050045d3 --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/atomic-read.cpp @@ -0,0 +1,24 @@ +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s + +void use(int x, unsigned int y, float f) { + // CHECK: cir.func{{.*}}(%[[X_ARG:.*]]: !s32i{{.*}}, %[[Y_ARG:.*]]: !u32i{{.*}}, %[[F_ARG:.*]]: !cir.float{{.*}}){{.*}}{ + // CHECK-NEXT: %[[X_ALLOC:.*]] = cir.alloca !s32i, !cir.ptr, ["x", init] + 
// CHECK-NEXT: %[[Y_ALLOC:.*]] = cir.alloca !u32i, !cir.ptr, ["y", init] + // CHECK-NEXT: %[[F_ALLOC:.*]] = cir.alloca !cir.float, !cir.ptr, ["f", init] + // CHECK-NEXT: cir.store %[[X_ARG]], %[[X_ALLOC]] : !s32i, !cir.ptr + // CHECK-NEXT: cir.store %[[Y_ARG]], %[[Y_ALLOC]] : !u32i, !cir.ptr + // CHECK-NEXT: cir.store %[[F_ARG]], %[[F_ALLOC]] : !cir.float, !cir.ptr + + // CHECK-NEXT: acc.atomic.read %[[X_ALLOC]] = %[[Y_ALLOC]] : !cir.ptr, !cir.ptr, !s32i +#pragma acc atomic read + x = y; + + // CHECK-NEXT: %[[X_LOAD:.*]] = cir.load{{.*}} %[[X_ALLOC]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[X_CAST:.*]] = cir.cast integral %[[X_LOAD]] : !s32i -> !u32i + // CHECK-NEXT: %[[Y_LOAD:.*]] = cir.load{{.*}} %[[Y_ALLOC]] : !cir.ptr, !u32i + // CHECK-NEXT: %[[CMP:.*]] = cir.cmp(eq, %[[X_CAST]], %[[Y_LOAD]]) : !u32i, !cir.bool + // CHECK-NEXT: %[[CMP_CAST:.*]] = builtin.unrealized_conversion_cast %[[CMP]] : !cir.bool to i1 + // CHECK-NEXT: acc.atomic.read if(%[[CMP_CAST]]) %[[F_ALLOC]] = %[[Y_ALLOC]] : !cir.ptr, !cir.ptr, !cir.float +#pragma acc atomic read if (x == y) + f = y; +} diff --git a/clang/test/CIR/CodeGenOpenACC/atomic-update.cpp b/clang/test/CIR/CodeGenOpenACC/atomic-update.cpp new file mode 100644 index 0000000000000..7ab6b62c4b41e --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/atomic-update.cpp @@ -0,0 +1,151 @@ +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s + +struct HasOps { + operator float(); + int thing(); +}; + +void use(int x, unsigned int y, float f, HasOps ops) { + // CHECK: cir.func{{.*}}(%[[X_ARG:.*]]: !s32i{{.*}}, %[[Y_ARG:.*]]: !u32i{{.*}}, %[[F_ARG:.*]]: !cir.float{{.*}}){{.*}}, %[[OPS_ARG:.*]]: !rec_HasOps{{.*}}) { + // CHECK-NEXT: %[[X_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["x", init] + // CHECK-NEXT: %[[Y_ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr, ["y", init] + // CHECK-NEXT: %[[F_ALLOCA:.*]] = cir.alloca !cir.float, !cir.ptr, 
["f", init] + // CHECK-NEXT: %[[OPS_ALLOCA:.*]] = cir.alloca !rec_HasOps, !cir.ptr, ["ops", init] + // CHECK-NEXT: cir.store %[[X_ARG]], %[[X_ALLOCA]] : !s32i, !cir.ptr + // CHECK-NEXT: cir.store %[[Y_ARG]], %[[Y_ALLOCA]] : !u32i, !cir.ptr + // CHECK-NEXT: cir.store %[[F_ARG]], %[[F_ALLOCA]] : !cir.float, !cir.ptr + // CHECK-NEXT: cir.store %[[OPS_ARG]], %[[OPS_ALLOCA]] : !rec_HasOps, !cir.ptr + + // CHECK-NEXT: acc.atomic.update %[[X_ALLOCA]] : !cir.ptr { + // CHECK-NEXT: ^bb0(%[[RECIPE_ARG:.*]]: !s32i{{.*}}): + // CHECK-NEXT: %[[TEMP_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["x_var", init] + // CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !s32i, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[TEMP_LOAD]]) nsw : !s32i, !s32i + // CHECK-NEXT: cir.store {{.*}}%[[INC]], %[[TEMP_ALLOCA]] : !s32i, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !s32i + // CHECK-NEXT: acc.yield %[[TEMP_LOAD]] : !s32i + // CHECK-NEXT: } +#pragma acc atomic update + ++x; + + // CHECK-NEXT: acc.atomic.update %[[Y_ALLOCA]] : !cir.ptr { + // CHECK-NEXT: ^bb0(%[[RECIPE_ARG:.*]]: !u32i{{.*}}): + // CHECK-NEXT: %[[TEMP_ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr, ["x_var", init] + // CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !u32i, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !u32i + // CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[TEMP_LOAD]]) : !u32i, !u32i + // CHECK-NEXT: cir.store {{.*}}%[[INC]], %[[TEMP_ALLOCA]] : !u32i, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !u32i + // CHECK-NEXT: acc.yield %[[TEMP_LOAD]] : !u32i + // CHECK-NEXT: } +#pragma acc atomic update + y++; + + // CHECK-NEXT: acc.atomic.update %[[F_ALLOCA]] : !cir.ptr { + // CHECK-NEXT: ^bb0(%[[RECIPE_ARG:.*]]: !cir.float{{.*}}): + // CHECK-NEXT: 
%[[TEMP_ALLOCA:.*]] = cir.alloca !cir.float, !cir.ptr, ["x_var", init] + // CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !cir.float, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !cir.float + // CHECK-NEXT: %[[INC:.*]] = cir.unary(dec, %[[TEMP_LOAD]]) : !cir.float, !cir.float + // CHECK-NEXT: cir.store {{.*}}%[[INC]], %[[TEMP_ALLOCA]] : !cir.float, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !cir.float + // CHECK-NEXT: acc.yield %[[TEMP_LOAD]] : !cir.float + // CHECK-NEXT: } +#pragma acc atomic update + f--; + + // CHECK-NEXT: acc.atomic.update %[[X_ALLOCA]] : !cir.ptr { + // CHECK-NEXT: ^bb0(%[[RECIPE_ARG:.*]]: !s32i{{.*}}): + // CHECK-NEXT: %[[TEMP_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["x_var", init] + // CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !s32i, !cir.ptr + // + // CHECK-NEXT: %[[F_LOAD:.*]] = cir.load{{.*}} %[[F_ALLOCA]] : !cir.ptr, !cir.float + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[INT_TO_F:.*]] = cir.cast int_to_float %[[TEMP_LOAD]] : !s32i -> !cir.float + // CHECK-NEXT: %[[ADD:.*]] = cir.binop(add, %[[INT_TO_F]], %[[F_LOAD]]) : !cir.float + // CHECK-NEXT: %[[F_TO_INT:.*]] = cir.cast float_to_int %[[ADD]] : !cir.float -> !s32i + // CHECK-NEXT: cir.store{{.*}} %[[F_TO_INT]], %[[TEMP_ALLOCA]] : !s32i, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !s32i + // CHECK-NEXT: acc.yield %[[TEMP_LOAD]] : !s32i + // CHECK-NEXT: } +#pragma acc atomic update + x += f; + + // CHECK-NEXT: acc.atomic.update %[[F_ALLOCA]] : !cir.ptr { + // CHECK-NEXT: ^bb0(%[[RECIPE_ARG:.*]]: !cir.float{{.*}}): + // CHECK-NEXT: %[[TEMP_ALLOCA:.*]] = cir.alloca !cir.float, !cir.ptr, ["x_var", init] + // CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !cir.float, !cir.ptr + // + // CHECK-NEXT: %[[Y_LOAD:.*]] = cir.load{{.*}} 
%[[Y_ALLOCA]] : !cir.ptr, !u32i + // CHECK-NEXT: %[[INT_TO_F:.*]] = cir.cast int_to_float %[[Y_LOAD]] : !u32i -> !cir.float + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !cir.float + // CHECK-NEXT: %[[DIV:.*]] = cir.binop(div, %[[TEMP_LOAD]], %[[INT_TO_F]]) : !cir.float + // CHECK-NEXT: cir.store{{.*}} %[[DIV]], %[[TEMP_ALLOCA]] : !cir.float, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !cir.float + // CHECK-NEXT: acc.yield %[[TEMP_LOAD]] : !cir.float + // CHECK-NEXT: } +#pragma acc atomic update + f /= y; + + // CHECK-NEXT: acc.atomic.update %[[Y_ALLOCA]] : !cir.ptr { + // CHECK-NEXT: ^bb0(%[[RECIPE_ARG:.*]]: !u32i{{.*}}): + // CHECK-NEXT: %[[TEMP_ALLOCA:.*]] = cir.alloca !u32i, !cir.ptr, ["x_var", init] + // CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !u32i, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !u32i + // CHECK-NEXT: %[[CALL:.*]] = cir.call {{.*}}(%[[OPS_ALLOCA]]) : (!cir.ptr) -> !s32i + // CHECK-NEXT: %[[CALL_CAST:.*]] = cir.cast integral %[[CALL]] : !s32i -> !u32i + // CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[TEMP_LOAD]], %[[CALL_CAST]]) : !u32i + // CHECK-NEXT: cir.store{{.*}} %[[MUL]], %[[TEMP_ALLOCA]] : !u32i, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !u32i + // CHECK-NEXT: acc.yield %[[TEMP_LOAD]] : !u32i + // CHECK-NEXT: } + +#pragma acc atomic update + y = y * ops.thing(); + + // CHECK-NEXT: acc.atomic.update %[[X_ALLOCA]] : !cir.ptr { + // CHECK-NEXT: ^bb0(%[[RECIPE_ARG:.*]]: !s32i{{.*}}): + // CHECK-NEXT: %[[TEMP_ALLOCA:.*]] = cir.alloca !s32i, !cir.ptr, ["x_var", init] + // CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !s32i, !cir.ptr + // + // CHECK-NEXT: %[[CALL:.*]] = cir.call {{.*}}(%[[OPS_ALLOCA]]) : (!cir.ptr) -> !s32i + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !s32i + // CHECK-NEXT: 
%[[OR:.*]] = cir.binop(or, %[[CALL]], %[[TEMP_LOAD]]) : !s32i +// CHECK-NEXT: cir.store{{.*}} %[[OR]], %[[TEMP_ALLOCA]] : !s32i, !cir.ptr + // +// CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !s32i +// CHECK-NEXT: acc.yield %[[TEMP_LOAD]] : !s32i +// CHECK-NEXT: } +#pragma acc atomic update + x = ops.thing() | x; + + // CHECK-NEXT: %[[X_LOAD:.*]] = cir.load{{.*}} %[[X_ALLOCA]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[BOOL_CAST:.*]] = cir.cast int_to_bool %[[X_LOAD]] : !s32i -> !cir.bool + // CHECK-NEXT: %[[X_CAST:.*]] = builtin.unrealized_conversion_cast %[[BOOL_CAST]] : !cir.bool to i1 + // CHECK-NEXT: acc.atomic.update if(%[[X_CAST]]) %[[F_ALLOCA]] : !cir.ptr { + // CHECK-NEXT: ^bb0(%[[RECIPE_ARG:.*]]: !cir.float{{.*}}): + // CHECK-NEXT: %[[TEMP_ALLOCA:.*]] = cir.alloca !cir.float, !cir.ptr, ["x_var", init] + // CHECK-NEXT: cir.store %[[RECIPE_ARG]], %[[TEMP_ALLOCA]] : !cir.float, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !cir.float + // CHECK-NEXT: %[[CALL:.*]] = cir.call {{.*}}(%[[OPS_ALLOCA]]) : (!cir.ptr) -> !cir.float + // CHECK-NEXT: %[[SUB:.*]] = cir.binop(sub, %[[TEMP_LOAD]], %[[CALL]]) : !cir.float + // CHECK-NEXT: cir.store{{.*}} %[[SUB]], %[[TEMP_ALLOCA]] : !cir.float, !cir.ptr + // + // CHECK-NEXT: %[[TEMP_LOAD:.*]] = cir.load{{.*}} %[[TEMP_ALLOCA]] : !cir.ptr, !cir.float + // CHECK-NEXT: acc.yield %[[TEMP_LOAD]] : !cir.float + // CHECK-NEXT: } +#pragma acc atomic update if (x) + f = f - ops; +} diff --git a/clang/test/CIR/CodeGenOpenACC/atomic-write.cpp b/clang/test/CIR/CodeGenOpenACC/atomic-write.cpp new file mode 100644 index 0000000000000..16855348cb1f1 --- /dev/null +++ b/clang/test/CIR/CodeGenOpenACC/atomic-write.cpp @@ -0,0 +1,55 @@ +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s + +extern "C" bool condition(int x, unsigned int y, float f); +extern 
"C" double do_thing(float f); + +struct ConvertsToScalar { + operator float(); +}; + +void use(int x, unsigned int y, float f, ConvertsToScalar cts) { + // CHECK: cir.func{{.*}}(%[[X_ARG:.*]]: !s32i{{.*}}, %[[Y_ARG:.*]]: !u32i{{.*}}, %[[F_ARG:.*]]: !cir.float{{.*}}){{.*}}, %[[CTS_ARG:.*]]: !rec_ConvertsToScalar{{.*}}) { + // CHECK-NEXT: %[[X_ALLOC:.*]] = cir.alloca !s32i, !cir.ptr, ["x", init] + // CHECK-NEXT: %[[Y_ALLOC:.*]] = cir.alloca !u32i, !cir.ptr, ["y", init] + // CHECK-NEXT: %[[F_ALLOC:.*]] = cir.alloca !cir.float, !cir.ptr, ["f", init] + // CHECK-NEXT: %[[CTS_ALLOC:.*]] = cir.alloca !rec_ConvertsToScalar, !cir.ptr, ["cts", init] + // + // CHECK-NEXT: cir.store %[[X_ARG]], %[[X_ALLOC]] : !s32i, !cir.ptr + // CHECK-NEXT: cir.store %[[Y_ARG]], %[[Y_ALLOC]] : !u32i, !cir.ptr + // CHECK-NEXT: cir.store %[[F_ARG]], %[[F_ALLOC]] : !cir.float, !cir.ptr + // CHECK-NEXT: cir.store %[[CTS_ARG]], %[[CTS_ALLOC]] : !rec_ConvertsToScalar, !cir.ptr + + // CHECK-NEXT: %[[Y_LOAD:.*]] = cir.load {{.*}}%[[Y_ALLOC]] : !cir.ptr, !u32i + // CHECK-NEXT: %[[Y_TO_FLOAT:.*]] = cir.cast int_to_float %[[Y_LOAD]] : !u32i -> !cir.float + // CHECK-NEXT: %[[F_LOAD:.*]] = cir.load {{.*}}%[[F_ALLOC]] : !cir.ptr, !cir.float + // CHECK-NEXT: %[[MUL:.*]] = cir.binop(mul, %[[Y_TO_FLOAT]], %[[F_LOAD]]) : !cir.float + // CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_int %[[MUL]] : !cir.float -> !s32i + // CHECK-NEXT: acc.atomic.write %[[X_ALLOC]] = %[[RHS_CAST]] : !cir.ptr, !s32i +#pragma acc atomic write + x = y * f; + + // CHECK-NEXT: %[[F_LOAD:.*]] = cir.load {{.*}}%[[F_ALLOC]] : !cir.ptr, !cir.float + // CHECK-NEXT: %[[CALL:.*]] = cir.call @do_thing(%[[F_LOAD]]) : (!cir.float) -> !cir.double + // CHECK-NEXT: %[[CALL_CAST:.*]] = cir.cast float_to_int %[[CALL]] : !cir.double -> !u32i + // CHECK-NEXT: acc.atomic.write %[[Y_ALLOC]] = %[[CALL_CAST]] : !cir.ptr, !u32i +#pragma acc atomic write + y = do_thing(f); + + // CHECK-NEXT: %[[X_LOAD:.*]] = cir.load {{.*}}%[[X_ALLOC]] : !cir.ptr, !s32i + 
// CHECK-NEXT: %[[X_CAST:.*]] = cir.cast int_to_float %[[X_LOAD]] : !s32i -> !cir.float + // CHECK-NEXT: %[[THING_CALL:.*]] = cir.call @do_thing(%[[X_CAST]]) : (!cir.float) -> !cir.double + // CHECK-NEXT: %[[THING_CAST:.*]] = cir.cast floating %[[THING_CALL]] : !cir.double -> !cir.float + // CHECK-NEXT: %[[X_LOAD:.*]] = cir.load {{.*}}%[[X_ALLOC]] : !cir.ptr, !s32i + // CHECK-NEXT: %[[Y_LOAD:.*]] = cir.load {{.*}}%[[Y_ALLOC]] : !cir.ptr, !u32i + // CHECK-NEXT: %[[F_LOAD:.*]] = cir.load {{.*}}%[[F_ALLOC]] : !cir.ptr, !cir.float + // CHECK-NEXT: %[[COND_CALL:.*]] = cir.call @condition(%[[X_LOAD]], %[[Y_LOAD]], %[[F_LOAD]]) : (!s32i, !u32i, !cir.float) -> !cir.bool + // CHECK-NEXT: %[[COND_CAST:.*]] = builtin.unrealized_conversion_cast %[[COND_CALL]] : !cir.bool to i1 + // CHECK-NEXT: acc.atomic.write if(%[[COND_CAST]]) %[[F_ALLOC]] = %[[THING_CAST]] : !cir.ptr, !cir.float +#pragma acc atomic write if (condition(x, y, f)) + f = do_thing(x); + + // CHECK-NEXT: %[[CTS_CONV_CALL:.*]] = cir.call @{{.*}}(%[[CTS_ALLOC]]) : (!cir.ptr) -> !cir.float + // CHECK-NEXT: acc.atomic.write %[[F_ALLOC]] = %[[CTS_CONV_CALL]] : !cir.ptr, !cir.float +#pragma acc atomic write + f = cts; +} diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp index c1c2e4b715365..53eba7bafb312 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-default-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -480,7 +480,77 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // 
CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: 
%[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// 
CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -507,7 +577,77 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = 
cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// 
CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// 
CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -1532,7 +1672,101 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: 
%[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = 
cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: 
%[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -1576,7 +1810,101 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: 
%[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast 
bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : 
!cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -2398,6 +2726,104 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = 
builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// 
CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: 
cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : 
!cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -2446,6 +2872,104 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: 
%[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : 
!cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : 
!cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp index 853f345e53ddf..63d69529bee53 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-float.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_combined() { T someVar; @@ -92,7 +92,18 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner 
{ // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -106,7 +117,18 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : 
!cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -371,7 +393,41 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// 
CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -401,7 +457,41 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: 
%[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -708,7 +798,45 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] 
{alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: 
cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -744,7 +872,45 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true 
{ +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp index 67e8460649f7e..9c1b16174d5fb 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -14,8 +14,8 @@ struct HasOperatorsInline { HasOperatorsInline &operator&=(HasOperatorsInline& other); HasOperatorsInline &operator|=(HasOperatorsInline& other); HasOperatorsInline &operator^=(HasOperatorsInline& other); - bool 
&operator&&(HasOperatorsInline& other); - bool &operator||(HasOperatorsInline& other); + HasOperatorsInline &operator&&(HasOperatorsInline& other); + HasOperatorsInline &operator||(HasOperatorsInline& other); // For min/max bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); @@ -277,7 +277,8 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -286,7 +287,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -308,7 +309,8 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: 
^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -318,7 +320,7 @@ void acc_combined() { for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1254,7 +1256,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], 
%[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1318,7 +1344,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : 
(!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2121,6 +2171,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call 
@_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2198,6 +2277,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast 
array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp index d74de8220225a..78b43ddc8f182 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_combined() { @@ -145,7 +145,18 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// 
CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -159,7 +170,18 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -587,7 +609,41 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: 
%[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: 
%[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -617,7 +673,41 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: 
%[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -1116,7 +1206,45 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay 
%[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -1152,7 +1280,45 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : 
(!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: 
cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp index a6df6c03f5c8e..5b37071a22ddb 100644 --- a/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/combined-reduction-clause-outline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsOutline { int i; unsigned u; @@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &); // For min/max bool operator<(HasOperatorsOutline &, HasOperatorsOutline &); @@ -276,7 +276,8 @@ void acc_combined() { // // CHECK-NEXT: } 
combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -285,7 +286,7 @@ void acc_combined() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -307,7 +308,8 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -317,7 +319,7 @@ void acc_combined() { for(int i=0;i < 5; ++i); #pragma acc parallel loop reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline 
: !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1253,7 +1255,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield 
%[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1317,7 +1343,31 @@ void acc_combined() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ 
-2120,6 +2170,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// 
CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2197,6 +2276,35 @@ void acc_combined() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : 
(!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c index d65d5d4add0ac..6ec1c43ebbe45 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -std=c23 -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -485,7 +485,80 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : 
!s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = 
cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// 
CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -512,7 +585,80 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = 
"u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} 
%[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: 
} ; @@ -1516,7 +1662,104 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = 
cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool 
%[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: 
%[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1546,7 +1789,104 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = 
cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int 
%[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : 
!cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -2376,6 +2716,107 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: 
%[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : 
(!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield 
%[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: 
%[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -2424,6 +2865,107 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: 
%[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: 
%[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_INT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: 
%[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp 
b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp index f32fa2d2d6372..7bd6f67a9e19e 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-default-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -480,7 +480,77 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: 
%[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member 
%[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -507,7 +577,77 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting 
combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} 
%[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast 
bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -1532,7 +1672,101 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// 
CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// 
CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield 
%[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1576,7 +1810,101 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: 
%[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> 
!cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: 
%[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // 
CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -2398,6 +2726,104 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> 
!cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = 
cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// 
CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -2446,6 +2872,104 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: 
%[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const 
#true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: 
%[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c index 9f7336727e5a9..13c335b867044 100644 --- 
a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s void acc_compute() { float someVar; @@ -92,7 +92,19 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -106,7 +118,19 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} 
%[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -371,7 +395,42 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: 
%[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -401,7 +460,42 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// 
CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, 
!cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -708,7 +802,46 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// 
CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -744,7 +877,46 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load 
%[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast int_to_float %[[RES_TO_INT]] : !s32i -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : 
!cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp index ffd26319e9bfc..67378210ba83c 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-float.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_compute() { @@ -93,7 +93,18 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -107,7 +118,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, 
%[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -372,7 +394,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, 
!s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -402,7 +458,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : 
!cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -709,7 +799,45 @@ 
void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool 
%[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -745,7 +873,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : 
!cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp index 1e367ee37a30d..262fe98d8e1df 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp +++ 
b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -14,8 +14,8 @@ struct HasOperatorsInline { HasOperatorsInline &operator&=(HasOperatorsInline& other); HasOperatorsInline &operator|=(HasOperatorsInline& other); HasOperatorsInline &operator^=(HasOperatorsInline& other); - bool &operator&&(HasOperatorsInline& other); - bool &operator||(HasOperatorsInline& other); + HasOperatorsInline &operator&&(HasOperatorsInline& other); + HasOperatorsInline &operator||(HasOperatorsInline& other); // For min/max bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); @@ -277,7 +277,8 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -286,7 +287,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca 
!rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -308,7 +309,8 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -318,7 +320,7 @@ void acc_compute() { ; #pragma acc parallel reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1254,7 +1256,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: 
cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1318,7 +1344,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: 
%[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2121,6 +2171,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool 
+// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2198,6 +2277,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] 
: !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c index 2f42a5c63f149..be7b12350360d 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu 
-Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s void acc_compute() { int someVar; @@ -144,7 +144,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -158,7 +169,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield 
%[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -586,7 +608,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// 
CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -616,7 +672,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : 
(!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1115,7 +1205,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] 
{alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// 
CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1151,7 +1279,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: 
cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp index af7bcf3770fe1..fb6984fcd0068 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_compute() { @@ -145,7 +145,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: 
%[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -159,7 +170,18 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -587,7 +609,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// 
CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// 
CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -617,7 +673,41 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = 
cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1116,7 +1206,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: 
%[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1152,7 +1280,45 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = 
builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// 
CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp index ec890e2b1de65..3a80ed5304928 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-outline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsOutline { int i; unsigned u; @@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &); // For min/max bool operator<(HasOperatorsOutline &, HasOperatorsOutline &); @@ -276,7 +276,8 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO 
OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -285,7 +286,7 @@ void acc_compute() { // CHECK-NEXT: } ; #pragma acc parallel reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -307,7 +308,8 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -317,7 +319,7 @@ void acc_compute() { ; #pragma acc parallel reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, 
["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1253,7 +1255,31 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1317,7 +1343,31 
@@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2120,6 +2170,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, 
%[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, 
%[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2197,6 +2276,35 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], 
%[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c index 08daa702c47f8..9b10a296e99f5 100644 --- a/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c +++ b/clang/test/CIR/CodeGenOpenACC/compute-reduction-clause-unsigned-int.c @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s void acc_compute() { unsigned int someVar; @@ -143,7 +143,19 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: 
%[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !u32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -157,7 +169,19 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !u32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } ; @@ -585,7 +609,42 @@ void acc_compute() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// 
CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -615,7 +674,42 @@ void acc_compute() { // // 
CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast 
integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1114,7 +1208,46 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// 
CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; @@ -1150,7 +1283,46 @@ void acc_compute() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : 
(!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_SINT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast integral %[[RES_TO_SINT]] : !s32i -> !u32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !u32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// 
CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } ; diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp index 1a77c0f10a144..11ebd7b4c26cb 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-default-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct DefaultOperators { int i; @@ -480,7 +480,77 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// 
CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield 
%[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const 
#false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -507,7 +577,77 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHSARG]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHSARG]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: 
cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHSARG]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHSARG]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true 
{ +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHSARG]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -1532,7 +1672,101 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const 
#cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// 
CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// 
CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); 
@@ -1576,7 +1810,101 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// 
CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast 
float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield 
%[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -2398,6 +2726,104 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// 
CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: 
%[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const 
#false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); @@ -2446,6 +2872,104 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: 
%[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_I]] : !s32i, 
!cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_U:.*]] = cir.get_member %[[LHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_U:.*]] = cir.get_member %[[RHS_STRIDE]][1] {name = "u"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_U]] : !cir.ptr, !u32i +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !u32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !u32i +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_U]] : !u32i, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_F:.*]] = cir.get_member %[[LHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_F:.*]] = cir.get_member %[[RHS_STRIDE]][2] {name = "f"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_F]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// 
CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_F]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_D:.*]] = cir.get_member %[[LHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_D:.*]] = cir.get_member %[[RHS_STRIDE]][3] {name = "d"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[LHS_CAST:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_D]] : !cir.ptr, !cir.double +// CHECK-NEXT: %[[RHS_CAST:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.double -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.double +// CHECK-NEXT: cir.store {{.*}} %[[RES_TO_VAL]], %[[LHS_GET_D]] : !cir.double, !cir.ptr +// +// CHECK-NEXT: %[[LHS_GET_B:.*]] = cir.get_member %[[LHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_B:.*]] = cir.get_member %[[RHS_STRIDE]][4] {name = "b"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_LOAD]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_B]] : !cir.ptr, !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_LOAD]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: cir.store {{.*}} %[[TERNARY]], %[[LHS_GET_B]] : !cir.bool, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : 
!cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i = 0; i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp index 7faef7111a9c8..57cc1afec2911 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-float.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_loop() { @@ -93,7 +93,18 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: 
cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -107,7 +118,18 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !cir.float, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -372,7 +394,41 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: 
cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -402,7 +458,41 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: 
Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// 
CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -709,7 +799,45 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} 
%[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -745,7 +873,45 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} 
+// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[LHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !cir.float +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast float_to_bool %[[RHS_LOAD]] : !cir.float -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_float %[[TERNARY]] : !cir.bool -> !cir.float +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !cir.float, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } 
+// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp index 43c9fbbce7533..8a5bf3ebb3e12 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-inline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsInline { int i; @@ -14,8 +14,8 @@ struct HasOperatorsInline { HasOperatorsInline &operator&=(HasOperatorsInline& other); HasOperatorsInline &operator|=(HasOperatorsInline& other); HasOperatorsInline &operator^=(HasOperatorsInline& other); - bool &operator&&(HasOperatorsInline& other); - bool &operator||(HasOperatorsInline& other); + HasOperatorsInline &operator&&(HasOperatorsInline& other); + HasOperatorsInline &operator||(HasOperatorsInline& other); // For min/max bool operator<(HasOperatorsInline& other); HasOperatorsInline &operator=(HasOperatorsInline& other); @@ -277,7 +277,8 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -286,7 +287,7 @@ void acc_loop() { // CHECK-NEXT: } 
for(int i=0;i < 5; ++i); #pragma acc loop reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS18HasOperatorsInline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsInline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : !cir.ptr -> !cir.ptr @@ -308,7 +309,8 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -318,7 +320,7 @@ void acc_loop() { for(int i=0;i < 5; ++i); #pragma acc loop reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_18HasOperatorsInline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1254,7 +1256,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, 
!cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1318,7 +1344,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// 
CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2121,6 +2171,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast 
%[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineaaERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2198,6 +2277,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = 
builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZN18HasOperatorsInlineooERS_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN18HasOperatorsInlineaSERS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp index 5353218866d47..f60dff9385412 
100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-int.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s template void acc_loop() { @@ -145,7 +145,18 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -159,7 +170,18 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHSARG]] : !cir.ptr +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// 
CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHSARG]] : !cir.ptr +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHSARG]] : !s32i, !cir.ptr // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -587,7 +609,41 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: 
%[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -617,7 +673,41 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = 
cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -1116,7 +1206,45 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope 
{ +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[FALSE:.*]] = cir.const #false +// CHECK-NEXT: cir.yield %[[FALSE]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast 
bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); @@ -1152,7 +1280,45 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay 
%[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load {{.*}} %[[LHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_TO_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load {{.*}} %[[RHS_STRIDE]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_TO_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_TO_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_VAL:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_VAL]], %[[LHS_STRIDE]] : !s32i, !cir.ptr +// +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR_LOAD]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } for(int i=0;i < 5; ++i); diff --git a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp index e193cfa1a5ab2..8613bc8a0f27e 100644 --- a/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp +++ b/clang/test/CIR/CodeGenOpenACC/loop-reduction-clause-outline-ops.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu 
-Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s struct HasOperatorsOutline { int i; unsigned u; @@ -15,8 +15,8 @@ HasOperatorsOutline &operator*=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator&=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator|=(HasOperatorsOutline &, HasOperatorsOutline &); HasOperatorsOutline &operator^=(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); -bool &operator||(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator&&(HasOperatorsOutline &, HasOperatorsOutline &); +HasOperatorsOutline &operator||(HasOperatorsOutline &, HasOperatorsOutline &); // For min/max bool operator<(HasOperatorsOutline &, HasOperatorsOutline &); @@ -276,7 +276,8 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -285,7 +286,7 @@ void acc_loop() { // CHECK-NEXT: } for(int i=0;i < 5; ++i); #pragma acc loop reduction(||:someVar) -// CHECK-NEXT: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_lor__ZTS19HasOperatorsOutline : !cir.ptr reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !rec_HasOperatorsOutline, !cir.ptr, ["openacc.reduction.init", init] // CHECK-NEXT: %[[GET_I:.*]] = cir.get_member %[[ALLOCA]][0] {name = "i"} : 
!cir.ptr -> !cir.ptr @@ -307,7 +308,8 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr {{.*}}, %[[RHSARG:.*]]: !cir.ptr {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHSARG]], %[[RHSARG]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHSARG]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr {{.*}}, %[[ARG:.*]]: !cir.ptr {{.*}}): @@ -317,7 +319,7 @@ void acc_loop() { for(int i=0;i < 5; ++i); #pragma acc loop reduction(+:someVarArr) -// CHECK-NEXT: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { +// CHECK: acc.reduction.recipe @reduction_add__ZTSA5_19HasOperatorsOutline : !cir.ptr> reduction_operator init { // CHECK-NEXT: ^bb0(%[[ARG:.*]]: !cir.ptr>{{.*}}) // CHECK-NEXT: %[[ALLOCA:.*]] = cir.alloca !cir.array, !cir.ptr>, ["openacc.reduction.init", init] // CHECK-NEXT: %[[TEMP_ITR:.*]] = cir.alloca !cir.ptr, !cir.ptr>, ["arrayinit.temp"] @@ -1253,7 +1255,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: 
%[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -1317,7 +1343,31 @@ void acc_loop() { // // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}) -// TODO OpenACC: Expecting combination operation here +// CHECK-NEXT: %[[ZERO:.*]] = cir.const #cir.int<0> : !s64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !s64i, !cir.ptr, ["itr"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[ZERO]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[END_VAL:.*]] = cir.const #cir.int<5> : !s64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[END_VAL]]) : !s64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !s64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride 
%[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !s64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !s64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !s64i, !s64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !s64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}): @@ -2120,6 +2170,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i 
+// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZaaR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): @@ -2197,6 +2276,35 @@ void acc_loop() { // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { // CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr> {{.*}}, %[[RHSARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}})) +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB]] : index to !u64i +// CHECK-NEXT: %[[UB:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB]] : index to !u64i +// CHECK-NEXT: %[[ITR:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB_CAST]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: 
%[[COND:.*]] = cir.cmp(lt, %[[ITR_LOAD]], %[[UB_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR_LOAD:.*]] = cir.load %[[ITR]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHSARG]] : !cir.ptr> -> !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_DECAY]], %[[ITR_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// +// CHECK-NEXT: %[[OP_RES:.*]] = cir.call @_ZooR19HasOperatorsOutlineS0_(%[[LHS_STRIDE]], %[[RHS_STRIDE]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr +// CHECK-NEXT: @_ZN19HasOperatorsOutlineaSERKS_(%[[LHS_STRIDE]], %[[OP_RES]]) : (!cir.ptr, !cir.ptr) -> !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } // CHECK-NEXT: acc.yield %[[LHSARG]] : !cir.ptr> // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[ORIG:.*]]: !cir.ptr> {{.*}}, %[[ARG:.*]]: !cir.ptr> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty{{.*}}): diff --git a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp index 33e12fe1cd833..b4d76e18bf345 100644 --- a/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp +++ b/clang/test/CIR/CodeGenOpenACC/openacc-not-implemented.cpp @@ -3,8 +3,8 @@ void HelloWorld(int *A, int *B, int *C, int N) { // expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Atomic Construct}} -#pragma acc atomic - N = N + 1; +#pragma acc atomic capture + B = A += ++N; // expected-error@+1{{ClangIR code gen Not Yet Implemented: OpenACC Declare Construct}} #pragma acc declare create(A) diff --git 
a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp index 20ad7a31b635e..29f1b5fe10158 100644 --- a/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp +++ b/clang/test/CIR/CodeGenOpenACC/reduction-clause-recipes.cpp @@ -1,4 +1,4 @@ -// RUN: not %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s +// RUN: %clang_cc1 -fopenacc -triple x86_64-linux-gnu -Wno-openacc-self-if-potential-conflict -emit-cir -fclangir -triple x86_64-linux-pc %s -o - | FileCheck %s // Note: unlike the 'private' recipe checks, this is just for spot-checking, // so this test isn't as comprehensive. The same code paths are used for @@ -753,8 +753,97 @@ void do_things(unsigned A, unsigned B) { // CHECK-NEXT: } // CHECK-NEXT: acc.yield // CHECK-NEXT: } combiner { -// CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr x 5>>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): -// CHECK-NEXT: acc.yield +// CHECK-NEXT: ^bb0(%[[LHSARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[RHSARG:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB3:.*]] = acc.get_lowerbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB3]] : index to !u64i +// CHECK-NEXT: %[[UB3:.*]] = acc.get_upperbound %[[BOUND3]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB3_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB3]] : index to !u64i +// CHECK-NEXT: %[[ITR3:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB3_CAST]], %[[ITR3]] : !u64i, !cir.ptr +// CHECK-NEXT: 
cir.for : cond { +// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[CMP:.*]] = cir.cmp(lt, %[[ITR3_LOAD]], %[[UB3_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[CMP]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR3_LOAD:.*]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_TLA_LOAD:.*]] = cir.load %[[LHSARG]] : !cir.ptr x 5>>>, !cir.ptr x 5>> +// CHECK-NEXT: %[[LHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[LHS_TLA_LOAD]], %[[ITR3_LOAD]] : (!cir.ptr x 5>>, !u64i) -> !cir.ptr x 5>> +// CHECK-NEXT: %[[RHS_TLA_LOAD:.*]] = cir.load %[[RHSARG]] : !cir.ptr x 5>>>, !cir.ptr x 5>> +// CHECK-NEXT: %[[RHS_BOUND3_STRIDE:.*]] = cir.ptr_stride %[[RHS_TLA_LOAD]], %[[ITR3_LOAD]] : (!cir.ptr x 5>>, !u64i) -> !cir.ptr x 5>> +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB2:.*]] = acc.get_lowerbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB2]] : index to !u64i +// CHECK-NEXT: %[[UB2:.*]] = acc.get_upperbound %[[BOUND2]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB2_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB2]] : index to !u64i +// CHECK-NEXT: %[[ITR2:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB2_CAST]], %[[ITR2]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR2_LOAD]], %[[UB2_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR2_LOAD:.*]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[LHS_BOUND3_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> +// CHECK-NEXT: %[[LHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND3_STRIDE_DECAY]], %[[ITR2_LOAD]] : (!cir.ptr>, !u64i) -> !cir.ptr> +// CHECK-NEXT: 
%[[RHS_BOUND3_STRIDE_DECAY:.*]] = cir.cast array_to_ptrdecay %[[RHS_BOUND3_STRIDE]] : !cir.ptr x 5>> -> !cir.ptr> +// CHECK-NEXT: %[[RHS_BOUND2_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND3_STRIDE_DECAY]], %[[ITR2_LOAD]] : (!cir.ptr>, !u64i) -> !cir.ptr> +// CHECK-NEXT: cir.scope { +// CHECK-NEXT: %[[LB1:.*]] = acc.get_lowerbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[LB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[LB1]] : index to !u64i +// CHECK-NEXT: %[[UB1:.*]] = acc.get_upperbound %[[BOUND1]] : (!acc.data_bounds_ty) -> index +// CHECK-NEXT: %[[UB1_CAST:.*]] = builtin.unrealized_conversion_cast %[[UB1]] : index to !u64i +// CHECK-NEXT: %[[ITR1:.*]] = cir.alloca !u64i, !cir.ptr, ["iter"] {alignment = 8 : i64} +// CHECK-NEXT: cir.store %[[LB1_CAST]], %[[ITR1]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.for : cond { +// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[COND:.*]] = cir.cmp(lt, %[[ITR1_LOAD]], %[[UB1_CAST]]) : !u64i, !cir.bool +// CHECK-NEXT: cir.condition(%[[COND]]) +// CHECK-NEXT: } body { +// CHECK-NEXT: %[[ITR1_LOAD:.*]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[LHS_BOUND2_STRIDE_LOAD:.*]] = cir.load %[[LHS_BOUND2_STRIDE]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: %[[LHS_STRIDE:.*]] = cir.ptr_stride %[[LHS_BOUND2_STRIDE_LOAD]], %[[ITR1_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[RHS_BOUND2_STRIDE_LOAD:.*]] = cir.load %[[RHS_BOUND2_STRIDE]] : !cir.ptr>, !cir.ptr +// CHECK-NEXT: %[[RHS_STRIDE:.*]] = cir.ptr_stride %[[RHS_BOUND2_STRIDE_LOAD]], %[[ITR1_LOAD]] : (!cir.ptr, !u64i) -> !cir.ptr +// CHECK-NEXT: %[[LHS_GET_I:.*]] = cir.get_member %[[LHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[RHS_GET_I:.*]] = cir.get_member %[[RHS_STRIDE]][0] {name = "i"} : !cir.ptr -> !cir.ptr +// CHECK-NEXT: %[[LHS_LOAD:.*]] = cir.load{{.*}} %[[LHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[LHS_CAST_BOOL:.*]] = cir.cast int_to_bool %[[LHS_LOAD]] : !s32i -> 
!cir.bool +// CHECK-NEXT: %[[TERNARY:.*]] = cir.ternary(%[[LHS_CAST_BOOL]], true { +// CHECK-NEXT: %[[TRUE:.*]] = cir.const #true +// CHECK-NEXT: cir.yield %[[TRUE]] : !cir.bool +// CHECK-NEXT: }, false { +// CHECK-NEXT: %[[RHS_LOAD:.*]] = cir.load{{.*}} %[[RHS_GET_I]] : !cir.ptr, !s32i +// CHECK-NEXT: %[[RHS_CAST_BOOL:.*]] = cir.cast int_to_bool %[[RHS_LOAD]] : !s32i -> !cir.bool +// CHECK-NEXT: cir.yield %[[RHS_CAST_BOOL]] : !cir.bool +// CHECK-NEXT: }) : (!cir.bool) -> !cir.bool +// CHECK-NEXT: %[[RES_TO_INT:.*]] = cir.cast bool_to_int %[[TERNARY]] : !cir.bool -> !s32i +// CHECK-NEXT: cir.store{{.*}} %[[RES_TO_INT]], %[[LHS_GET_I]] : !s32i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR1_LOAD]] = cir.load %[[ITR1]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR1_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR1]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR2_LOAD]] = cir.load %[[ITR2]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR2_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR2]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } step { +// CHECK-NEXT: %[[ITR3_LOAD]] = cir.load %[[ITR3]] : !cir.ptr, !u64i +// CHECK-NEXT: %[[INC:.*]] = cir.unary(inc, %[[ITR3_LOAD]]) : !u64i, !u64i +// CHECK-NEXT: cir.store %[[INC]], %[[ITR3]] : !u64i, !cir.ptr +// CHECK-NEXT: cir.yield +// CHECK-NEXT: } +// CHECK-NEXT: } +// CHECK-NEXT: acc.yield %[[LHSARG]] // CHECK-NEXT: } destroy { // CHECK-NEXT: ^bb0(%[[REF:.*]]: !cir.ptr x 5>>> {{.*}}, %[[PRIVATE:.*]]: !cir.ptr x 5>>> {{.*}}, %[[BOUND1:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND2:.*]]: !acc.data_bounds_ty {{.*}}, %[[BOUND3:.*]]: !acc.data_bounds_ty {{.*}}): // CHECK-NEXT: cir.scope { diff --git 
a/clang/test/ClangScanDeps/resource_directory.c b/clang/test/ClangScanDeps/resource_directory.c index 6183e8aefacfa..5c4b24f386de3 100644 --- a/clang/test/ClangScanDeps/resource_directory.c +++ b/clang/test/ClangScanDeps/resource_directory.c @@ -1,4 +1,5 @@ -// REQUIRES: shell +// Path separator differences +// UNSUPPORTED: system-windows // RUN: rm -rf %t && mkdir %t // RUN: cp %S/Inputs/resource_directory/* %t diff --git a/clang/test/ClangScanDeps/response-file.c b/clang/test/ClangScanDeps/response-file.c index c08105c127202..f905438e86af6 100644 --- a/clang/test/ClangScanDeps/response-file.c +++ b/clang/test/ClangScanDeps/response-file.c @@ -1,10 +1,12 @@ -// Check that the scanner can handle a response file input. +// Check that the scanner can handle a response file input. Uses -verbatim-args +// to ensure response files are expanded by the scanner library and not the +// argument adjuster in clang-scan-deps. // RUN: rm -rf %t // RUN: split-file %s %t // RUN: sed -e "s|DIR|%/t|g" %t/cdb.json.template > %t/cdb.json -// RUN: clang-scan-deps -format experimental-full -compilation-database %t/cdb.json > %t/deps.json +// RUN: clang-scan-deps -verbatim-args -format experimental-full -compilation-database %t/cdb.json > %t/deps.json // RUN: cat %t/deps.json | sed 's:\\\\\?:/:g' | FileCheck -DPREFIX=%/t %s diff --git a/clang/test/CodeGen/AArch64/sign-return-address.c b/clang/test/CodeGen/AArch64/sign-return-address.c index 11dd683a3d7e9..2b505de339054 100644 --- a/clang/test/CodeGen/AArch64/sign-return-address.c +++ b/clang/test/CodeGen/AArch64/sign-return-address.c @@ -28,17 +28,17 @@ // NONE-NOT: !"branch-target-enforcement" // ALL-NOT: !"branch-target-enforcement" // PART-NOT: !"branch-target-enforcement" -// BTE: !{i32 8, !"branch-target-enforcement", i32 1} +// BTE: !{i32 8, !"branch-target-enforcement", i32 2} // B-KEY-NOT: !"branch-target-enforcement" // NONE-NOT: !"sign-return-address" -// ALL: !{i32 8, !"sign-return-address", i32 1} -// PART: !{i32 8, 
!"sign-return-address", i32 1} +// ALL: !{i32 8, !"sign-return-address", i32 2} +// PART: !{i32 8, !"sign-return-address", i32 2} // BTE-NOT: !"sign-return-address" -// B-KEY: !{i32 8, !"sign-return-address", i32 1} +// B-KEY: !{i32 8, !"sign-return-address", i32 2} // NONE-NOT: !"sign-return-address-all" -// ALL: !{i32 8, !"sign-return-address-all", i32 1} +// ALL: !{i32 8, !"sign-return-address-all", i32 2} // PART-NOT: !"sign-return-address-all" // BTE-NOT: !"sign-return-address-all" // B-KEY-NOT: !"sign-return-address-all" @@ -47,6 +47,6 @@ // ALL-NOT: !"sign-return-address-with-bkey" // PART-NOT: !"sign-return-address-with-bkey" // BTE-NOT: !"sign-return-address-with-bkey" -// B-KEY: !{i32 8, !"sign-return-address-with-bkey", i32 1} +// B-KEY: !{i32 8, !"sign-return-address-with-bkey", i32 2} void foo() {} diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e4m3_e4m3.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e4m3_e4m3.c new file mode 100644 index 0000000000000..d162f449d9239 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e4m3_e4m3.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8f \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.e4m3.e4m3.i64.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_e4m3_e4m3_w4_u8m8_u8m8(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e4m3_e5m2.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e4m3_e5m2.c new file mode 100644 index 0000000000000..342af1eca65bf --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e4m3_e5m2.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8f \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.e4m3.e5m2.i64.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_e4m3_e5m2_w4_u8m8_u8m8(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e5m2_e4m3.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e5m2_e4m3.c new file mode 100644 index 0000000000000..b8f58fe230b76 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e5m2_e4m3.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// 
REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8f \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.e5m2.e4m3.i64.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_e5m2_e4m3_w4_u8m8_u8m8(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e5m2_e5m2.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e5m2_e5m2.c new file mode 100644 index 0000000000000..7c2eb3227f004 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_e5m2_e5m2.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8f \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.e5m2.e5m2.i64.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void 
test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_e5m2_e5m2_w4_u8m8_u8m8(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_f_f.c new file mode 100644 index 0000000000000..262bc0a5cf66a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_f_f.c @@ -0,0 +1,40 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a16f \ +// RUN: -target-feature +xsfmm32a32f -target-feature +xsfmm64a64f \ +// RUN: -target-feature +zvfhmin -target-feature +zve64d -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_f_f_w2_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, [[VS2]], [[VS1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 2) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_f_f_w2_f16m8(vfloat16m8_t vs2, vfloat16m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_f_f_w2_f16m8(0, vs2, vs1, tm, tn, tk); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_f_f_w1_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.f.f.i64.nxv16f32(i64 0, [[VS2]], [[VS1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 1) +// CHECK-RV64-NEXT: ret void +// +void 
test_sf_mm_f_f_w1_f32m8(vfloat32m8_t vs2, vfloat32m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_f_f_w1_f32m8(0, vs2, vs1, tm, tn, tk); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_f_f_w1_f64m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.f.f.i64.nxv8f64(i64 0, [[VS2]], [[VS1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 1) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_f_f_w1_f64m8(vfloat64m8_t vs2, vfloat64m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_f_f_w1_f64m8(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_s_s.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_s_s.c new file mode 100644 index 0000000000000..35c6756b8b663 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_s_s.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8i \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_s_s_w4_i8m8_i8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.s.s.i64.nxv64i8.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_s_s_w4_i8m8_i8m8(vint8m8_t vs2, vint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_s_s_w4_i8m8_i8m8(0, vs2, 
vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_s_u.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_s_u.c new file mode 100644 index 0000000000000..c142fcc19832a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_s_u.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8i \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_s_u_w4_i8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.s.u.i64.nxv64i8.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_s_u_w4_i8m8_u8m8(vint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_s_u_w4_i8m8_u8m8(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_u_s.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_u_s.c new file mode 100644 index 0000000000000..46350e576e9f7 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_u_s.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8i \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck 
--check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_u_s_w4_u8m8_i8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.u.s.i64.nxv64i8.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_u_s_w4_u8m8_i8m8(vuint8m8_t vs2, vint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_u_s_w4_u8m8_i8m8(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_u_u.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_u_u.c new file mode 100644 index 0000000000000..de84db50339f4 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_mm_u_u.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8i \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_u_u_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.u.u.i64.nxv64i8.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_u_u_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_u_u_w4_u8m8_u8m8(0, vs2, vs1, tm, tn, tk); +} + diff --git 
a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte16.c new file mode 100644 index 0000000000000..2c23176aca97c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte16.c @@ -0,0 +1,49 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zvfhmin -target-feature +zvfbfmin \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte16_bf16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte16_bf16(size_t tss, __bf16 *base, size_t vl) { + return __riscv_sf_vlte16_bf16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte16_f16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte16_f16(size_t tss, _Float16 *base, size_t vl) { + return __riscv_sf_vlte16_f16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte16_i16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// 
+void test_sf_vlte16_i16(size_t tss, int16_t *base, size_t vl) { + return __riscv_sf_vlte16_i16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte16_u16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte16_u16(size_t tss, uint16_t *base, size_t vl) { + return __riscv_sf_vlte16_u16(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte32.c new file mode 100644 index 0000000000000..a0422cfe2c8ce --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte32.c @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte32_f32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte32_f32(size_t tss, float *base, size_t vl) { + return __riscv_sf_vlte32_f32(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte32_i32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte32.i64(i64 [[TSS]], 
ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte32_i32(size_t tss, int32_t *base, size_t vl) { + return __riscv_sf_vlte32_i32(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte32_u32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte32_u32(size_t tss, uint32_t *base, size_t vl) { + return __riscv_sf_vlte32_u32(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte64.c new file mode 100644 index 0000000000000..e8b9552451e65 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte64.c @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte64_f64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte64_f64(size_t tss, double *base, size_t vl) { + return __riscv_sf_vlte64_f64(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte64_i64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte64_i64(size_t tss, int64_t *base, size_t vl) { + return __riscv_sf_vlte64_i64(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte64_u64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte64_u64(size_t tss, uint64_t *base, size_t vl) { + return __riscv_sf_vlte64_u64(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte8.c new file mode 100644 index 0000000000000..a86ccec3bd3ca --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vlte8.c @@ -0,0 +1,28 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte8_i8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte8.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte8_i8(size_t tss, int8_t *base, size_t vl) { + return __riscv_sf_vlte8_i8(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte8_u8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef 
[[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte8.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte8_u8(size_t tss, uint8_t *base, size_t vl) { + return __riscv_sf_vlte8_u8(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettk.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettk.c new file mode 100644 index 0000000000000..32b7bce839b6e --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettk.c @@ -0,0 +1,99 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zve64x \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e8w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 0, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettk_e8w1(size_t tn) { + return __riscv_sf_vsettk_e8w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e8w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 0, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettk_e8w2(size_t tn) { + return __riscv_sf_vsettk_e8w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e8w4( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 
@llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 0, i64 3) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettk_e8w4(size_t tn) { + return __riscv_sf_vsettk_e8w4(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e16w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 1, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettk_e16w1(size_t tn) { + return __riscv_sf_vsettk_e16w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e16w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 1, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettk_e16w2(size_t tn) { + return __riscv_sf_vsettk_e16w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e16w4( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 1, i64 3) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettk_e16w4(size_t tn) { + return __riscv_sf_vsettk_e16w4(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e32w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 2, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettk_e32w1(size_t tn) { + return __riscv_sf_vsettk_e32w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e32w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 2, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t 
test_sf_vsettk_e32w2(size_t tn) { + return __riscv_sf_vsettk_e32w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettk_e64w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettk.i64(i64 [[TN]], i64 3, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettk_e64w1(size_t tn) { + return __riscv_sf_vsettk_e64w1(tn); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettm.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettm.c new file mode 100644 index 0000000000000..0ce7c5785a22f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettm.c @@ -0,0 +1,99 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zve64x \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettm_e8w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 0, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e8w1(size_t tn) { + return __riscv_sf_vsettm_e8w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettm_e8w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 0, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e8w2(size_t tn) { + return __riscv_sf_vsettm_e8w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 
@test_sf_vsettm_e8w4( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 0, i64 3) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e8w4(size_t tn) { + return __riscv_sf_vsettm_e8w4(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettm_e16w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 1, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e16w1(size_t tn) { + return __riscv_sf_vsettm_e16w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettm_e16w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 1, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e16w2(size_t tn) { + return __riscv_sf_vsettm_e16w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettm_e16w4( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 1, i64 3) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e16w4(size_t tn) { + return __riscv_sf_vsettm_e16w4(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettm_e32w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 2, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e32w1(size_t tn) { + return __riscv_sf_vsettm_e32w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettm_e32w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 2, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e32w2(size_t tn) { + return __riscv_sf_vsettm_e32w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettm_e64w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettm.i64(i64 [[TN]], i64 3, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettm_e64w1(size_t tn) { + return __riscv_sf_vsettm_e64w1(tn); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettn.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettn.c new file mode 100644 index 0000000000000..7b058f4667738 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettn.c @@ -0,0 +1,99 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zve64x \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettn_e8w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 0, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettn_e8w1(size_t tn) { + return __riscv_sf_vsettn_e8w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettn_e8w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 0, i64 2) +// CHECK-RV64-NEXT: ret i64 
[[TMP0]] +// +size_t test_sf_vsettn_e8w2(size_t tn) { + return __riscv_sf_vsettn_e8w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettn_e8w4( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 0, i64 3) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettn_e8w4(size_t tn) { + return __riscv_sf_vsettn_e8w4(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettn_e16w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 1, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettn_e16w1(size_t tn) { + return __riscv_sf_vsettn_e16w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettn_e16w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 1, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettn_e16w2(size_t tn) { + return __riscv_sf_vsettn_e16w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettn_e16w4( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 1, i64 3) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettn_e16w4(size_t tn) { + return __riscv_sf_vsettn_e16w4(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettn_e32w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 2, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettn_e32w1(size_t tn) { + return __riscv_sf_vsettn_e32w1(tn); +} + +// CHECK-RV64-LABEL: 
define dso_local i64 @test_sf_vsettn_e32w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 2, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettn_e32w2(size_t tn) { + return __riscv_sf_vsettn_e32w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettn_e64w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 3, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettn_e64w1(size_t tn) { + return __riscv_sf_vsettn_e64w1(tn); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettnt.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettnt.c new file mode 100644 index 0000000000000..29eaec3668c4f --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vsettnt.c @@ -0,0 +1,99 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zve64x \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e8w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 0, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e8w1(size_t tn) { + return __riscv_sf_vsettnt_e8w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e8w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// 
CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 0, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e8w2(size_t tn) { + return __riscv_sf_vsettnt_e8w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e8w4( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 0, i64 3) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e8w4(size_t tn) { + return __riscv_sf_vsettnt_e8w4(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e16w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 1, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e16w1(size_t tn) { + return __riscv_sf_vsettnt_e16w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e16w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 1, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e16w2(size_t tn) { + return __riscv_sf_vsettnt_e16w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e16w4( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 1, i64 3) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e16w4(size_t tn) { + return __riscv_sf_vsettnt_e16w4(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e32w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 
[[TN]], i64 2, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e32w1(size_t tn) { + return __riscv_sf_vsettnt_e32w1(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e32w2( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 2, i64 2) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e32w2(size_t tn) { + return __riscv_sf_vsettnt_e32w2(tn); +} + +// CHECK-RV64-LABEL: define dso_local i64 @test_sf_vsettnt_e64w1( +// CHECK-RV64-SAME: i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call i64 @llvm.riscv.sf.vsettnt.i64(i64 [[TN]], i64 3, i64 1) +// CHECK-RV64-NEXT: ret i64 [[TMP0]] +// +size_t test_sf_vsettnt_e64w1(size_t tn) { + return __riscv_sf_vsettnt_e64w1(tn); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste16.c new file mode 100644 index 0000000000000..bf50e7fe527dd --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste16.c @@ -0,0 +1,49 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zvfhmin -target-feature +zvfbfmin \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste16_bf16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret void +// +void test_sf_vste16_bf16(size_t tss, __bf16 *base, size_t vl) { + return __riscv_sf_vste16_bf16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste16_f16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste16_f16(size_t tss, _Float16 *base, size_t vl) { + return __riscv_sf_vste16_f16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste16_i16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste16_i16(size_t tss, int16_t *base, size_t vl) { + return __riscv_sf_vste16_i16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste16_u16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste16_u16(size_t tss, uint16_t *base, size_t vl) { + return __riscv_sf_vste16_u16(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste32.c new file mode 100644 index 0000000000000..d1d7191f2b11d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste32.c @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target 
+// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste32_f32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste32_f32(size_t tss, float *base, size_t vl) { + return __riscv_sf_vste32_f32(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste32_i32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste32_i32(size_t tss, int32_t *base, size_t vl) { + return __riscv_sf_vste32_i32(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste32_u32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste32_u32(size_t tss, uint32_t *base, size_t vl) { + return __riscv_sf_vste32_u32(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste64.c new file mode 100644 index 0000000000000..4c7d4b18b30f9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste64.c @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py 
UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste64_f64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste64_f64(size_t tss, double *base, size_t vl) { + return __riscv_sf_vste64_f64(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste64_i64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste64_i64(size_t tss, int64_t *base, size_t vl) { + return __riscv_sf_vste64_i64(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste64_u64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste64_u64(size_t tss, uint64_t *base, size_t vl) { + return __riscv_sf_vste64_u64(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste8.c new file mode 100644 index 0000000000000..d03bc61c49f90 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vste8.c @@ -0,0 +1,28 @@ +// NOTE: Assertions 
have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste8_i8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste8.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste8_i8(size_t tss, int8_t *base, size_t vl) { + return __riscv_sf_vste8_i8(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste8_u8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste8.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste8_u8(size_t tss, uint8_t *base, size_t vl) { + return __riscv_sf_vste8_u8(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtdiscard.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtdiscard.c new file mode 100644 index 0000000000000..7eef2c927934d --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtdiscard.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtdiscard( +// 
CHECK-RV64-SAME: ) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtdiscard() +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtdiscard() { + return __riscv_sf_vtdiscard(); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtmv_t_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtmv_t_v.c new file mode 100644 index 0000000000000..37ee503391132 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtmv_t_v.c @@ -0,0 +1,130 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zvfhmin -target-feature +zvfbfmin \ +// RUN: -target-feature +zve64d \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_bf16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv32bf16.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_bf16m8(size_t tss, vbfloat16m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_bf16m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_f16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv32f16.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_f16m8(size_t tss, vfloat16m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_f16m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local 
void @test_sf_vtmv_t_v_f32m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv16f32.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_f32m8(size_t tss, vfloat32m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_f32m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_f64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv8f64.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_f64m8(size_t tss, vfloat64m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_f64m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_i8m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv64i8.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_i8m8(size_t tss, vint8m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_i8m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_i16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv32i16.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_i16m8(size_t tss, vint16m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_i16m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_i32m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void 
@llvm.riscv.sf.vtmv.t.v.nxv16i32.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_i32m8(size_t tss, vint32m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_i32m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_i64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv8i64.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_i64m8(size_t tss, vint64m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_i64m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_u8m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv64i8.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_u8m8(size_t tss, vuint8m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_u8m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_u16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv32i16.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_u16m8(size_t tss, vuint16m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_u16m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_u32m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv16i32.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_u32m8(size_t tss, vuint32m8_t src, size_t vl) { + return 
__riscv_sf_vtmv_t_v_u32m8(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_u64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv8i64.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_u64m8(size_t tss, vuint64m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v_u64m8(tss, src, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtmv_v_t.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtmv_v_t.c new file mode 100644 index 0000000000000..d127cf42173ec --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtmv_v_t.c @@ -0,0 +1,130 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zvfhmin -target-feature +zvfbfmin \ +// RUN: -target-feature +zve64d \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_bf16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv32bf16.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vbfloat16m8_t test_sf_vtmv_v_t_bf16m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_bf16m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_f16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call 
@llvm.riscv.sf.vtmv.v.t.nxv32f16.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat16m8_t test_sf_vtmv_v_t_f16m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_f16m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_f32m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv16f32.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat32m8_t test_sf_vtmv_v_t_f32m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_f32m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_f64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv8f64.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vfloat64m8_t test_sf_vtmv_v_t_f64m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_f64m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_i8m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint8m8_t test_sf_vtmv_v_t_i8m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_i8m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_i16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv32i16.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint16m8_t test_sf_vtmv_v_t_i16m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_i16m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_i32m8( +// CHECK-RV64-SAME: 
i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv16i32.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint32m8_t test_sf_vtmv_v_t_i32m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_i32m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_i64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv8i64.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vint64m8_t test_sf_vtmv_v_t_i64m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_i64m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_u8m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv64i8.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint8m8_t test_sf_vtmv_v_t_u8m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_u8m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_u16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv32i16.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint16m8_t test_sf_vtmv_v_t_u16m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_u16m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_u32m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv16i32.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint32m8_t test_sf_vtmv_v_t_u32m8(size_t tss, size_t vl) { + 
return __riscv_sf_vtmv_v_t_u32m8(tss, vl); +} + +// CHECK-RV64-LABEL: define dso_local @test_sf_vtmv_v_t_u64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: [[TMP0:%.*]] = call @llvm.riscv.sf.vtmv.v.t.nxv8i64.i64(i64 [[TSS]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret [[TMP0]] +// +vuint64m8_t test_sf_vtmv_v_t_u64m8(size_t tss, size_t vl) { + return __riscv_sf_vtmv_v_t_u64m8(tss, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtzero_t.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtzero_t.c new file mode 100644 index 0000000000000..1e4de48d321a3 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/non-overloaded/sf_vtzero_t.c @@ -0,0 +1,99 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zve64x \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e8w1( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 [[TM]], i64 [[TN]], i64 3, i64 1) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e8w1(size_t tm, size_t tn) { + return __riscv_sf_vtzero_t_e8w1(0, tm, tn); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e8w2( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 [[TM]], i64 [[TN]], i64 3, i64 2) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e8w2(size_t 
tm, size_t tn) { + return __riscv_sf_vtzero_t_e8w2(0, tm, tn); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e8w4( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 [[TM]], i64 [[TN]], i64 3, i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e8w4(size_t tm, size_t tn) { + return __riscv_sf_vtzero_t_e8w4(0, tm, tn); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e16w1( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 [[TM]], i64 [[TN]], i64 4, i64 1) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e16w1(size_t tm, size_t tn) { + return __riscv_sf_vtzero_t_e16w1(0, tm, tn); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e16w2( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 [[TM]], i64 [[TN]], i64 4, i64 2) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e16w2(size_t tm, size_t tn) { + return __riscv_sf_vtzero_t_e16w2(0, tm, tn); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e16w4( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 [[TM]], i64 [[TN]], i64 4, i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e16w4(size_t tm, size_t tn) { + return __riscv_sf_vtzero_t_e16w4(0, tm, tn); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e32w1( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, 
i64 [[TM]], i64 [[TN]], i64 5, i64 1) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e32w1(size_t tm, size_t tn) { + return __riscv_sf_vtzero_t_e32w1(0, tm, tn); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e32w2( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 [[TM]], i64 [[TN]], i64 5, i64 2) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e32w2(size_t tm, size_t tn) { + return __riscv_sf_vtzero_t_e32w2(0, tm, tn); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtzero_t_e64w1( +// CHECK-RV64-SAME: i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtzero.t.i64(i64 0, i64 [[TM]], i64 [[TN]], i64 6, i64 1) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtzero_t_e64w1(size_t tm, size_t tn) { + return __riscv_sf_vtzero_t_e64w1(0, tm, tn); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e4m3_e4m3.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e4m3_e4m3.c new file mode 100644 index 0000000000000..2f6c4dc324d60 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e4m3_e4m3.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8f \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// 
CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.e4m3.e4m3.i64.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_e4m3_e4m3_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_e4m3_e4m3(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e4m3_e5m2.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e4m3_e5m2.c new file mode 100644 index 0000000000000..40ae780d6c461 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e4m3_e5m2.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8f \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.e4m3.e5m2.i64.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_e4m3_e5m2_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_e4m3_e5m2(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e5m2_e4m3.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e5m2_e4m3.c new file mode 100644 index 0000000000000..f4f024cbd0988 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e5m2_e4m3.c @@ 
-0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8f \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.e5m2.e4m3.i64.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_e5m2_e4m3_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_e5m2_e4m3(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e5m2_e5m2.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e5m2_e5m2.c new file mode 100644 index 0000000000000..01399d5d81d39 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_e5m2_e5m2.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8f \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.e5m2.e5m2.i64.nxv64i8(i64 0, [[vs2]], 
[[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_e5m2_e5m2_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_e5m2_e5m2(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_f_f.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_f_f.c new file mode 100644 index 0000000000000..2371e4ec24fc9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_f_f.c @@ -0,0 +1,40 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a16f \ +// RUN: -target-feature +xsfmm32a32f -target-feature +xsfmm64a64f \ +// RUN: -target-feature +zvfhmin -target-feature +zve64d -disable-O0-optnone \ +// RUN: -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_f_f_w2_f16m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.f.f.i64.nxv32f16(i64 0, [[VS2]], [[VS1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 2) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_f_f_w2_f16m8(vfloat16m8_t vs2, vfloat16m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_f_f_w2(0, vs2, vs1, tm, tn, tk); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_f_f_w1_f32m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.f.f.i64.nxv16f32(i64 0, [[VS2]], [[VS1]], i64 [[TM]], i64 [[TN]], i64 
[[TK]], i64 1) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_f_f_w1_f32m8(vfloat32m8_t vs2, vfloat32m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_f_f_w1(0, vs2, vs1, tm, tn, tk); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_f_f_w1_f64m8( +// CHECK-RV64-SAME: [[VS2:%.*]], [[VS1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.f.f.i64.nxv8f64(i64 0, [[VS2]], [[VS1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 1) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_f_f_w1_f64m8(vfloat64m8_t vs2, vfloat64m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_f_f_w1(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_s_s.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_s_s.c new file mode 100644 index 0000000000000..2d34f7d05060a --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_s_s.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8i \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_s_s_w4_i8m8_i8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.s.s.i64.nxv64i8.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_s_s_w4_i8m8_i8m8(vint8m8_t vs2, vint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return 
__riscv_sf_mm_s_s(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_s_u.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_s_u.c new file mode 100644 index 0000000000000..1f9bc33412871 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_s_u.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8i \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_s_u_w4_i8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.s.u.i64.nxv64i8.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_s_u_w4_i8m8_u8m8(vint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_s_u(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_u_s.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_u_s.c new file mode 100644 index 0000000000000..2c6d538821fb2 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_u_s.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8i \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s 
+ +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_u_s_w4_u8m8_i8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.u.s.i64.nxv64i8.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_u_s_w4_u8m8_i8m8(vuint8m8_t vs2, vint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_u_s(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_u_u.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_u_u.c new file mode 100644 index 0000000000000..bb1eaf19b04da --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_mm_u_u.c @@ -0,0 +1,18 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmm32a8i \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_mm_u_u_w4_u8m8_u8m8( +// CHECK-RV64-SAME: [[vs2:%.*]], [[vs1:%.*]], i64 noundef [[TM:%.*]], i64 noundef [[TN:%.*]], i64 noundef [[TK:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.mm.u.u.i64.nxv64i8.nxv64i8(i64 0, [[vs2]], [[vs1]], i64 [[TM]], i64 [[TN]], i64 [[TK]], i64 4) +// CHECK-RV64-NEXT: ret void +// +void test_sf_mm_u_u_w4_u8m8_u8m8(vuint8m8_t vs2, vuint8m8_t vs1, size_t tm, size_t tn, size_t tk) { + return __riscv_sf_mm_u_u(0, vs2, vs1, tm, tn, tk); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte16.c 
b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte16.c new file mode 100644 index 0000000000000..e199c1fb35041 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte16.c @@ -0,0 +1,49 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zvfhmin -target-feature +zvfbfmin \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte16_bf16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte16_bf16(size_t tss, __bf16 *base, size_t vl) { + return __riscv_sf_vlte16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte16_f16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte16_f16(size_t tss, _Float16 *base, size_t vl) { + return __riscv_sf_vlte16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte16_i16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte16_i16(size_t tss, int16_t *base, size_t vl) { + return __riscv_sf_vlte16(tss, base, 
vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte16_u16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte16_u16(size_t tss, uint16_t *base, size_t vl) { + return __riscv_sf_vlte16(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte32.c new file mode 100644 index 0000000000000..388884d91f0d0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte32.c @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte32_f32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte32_f32(size_t tss, float *base, size_t vl) { + return __riscv_sf_vlte32(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte32_i32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte32_i32(size_t tss, int32_t *base, size_t vl) { + 
return __riscv_sf_vlte32(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte32_u32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte32_u32(size_t tss, uint32_t *base, size_t vl) { + return __riscv_sf_vlte32(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte64.c new file mode 100644 index 0000000000000..80d7542f10585 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte64.c @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte64_f64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte64_f64(size_t tss, double *base, size_t vl) { + return __riscv_sf_vlte64(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte64_i64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void 
test_sf_vlte64_i64(size_t tss, int64_t *base, size_t vl) { + return __riscv_sf_vlte64(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte64_u64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte64_u64(size_t tss, uint64_t *base, size_t vl) { + return __riscv_sf_vlte64(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte8.c new file mode 100644 index 0000000000000..c29bf933cf85c --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vlte8.c @@ -0,0 +1,28 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte8_i8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte8.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vlte8_i8(size_t tss, int8_t *base, size_t vl) { + return __riscv_sf_vlte8(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vlte8_u8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vlte8.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// 
CHECK-RV64-NEXT: ret void +// +void test_sf_vlte8_u8(size_t tss, uint8_t *base, size_t vl) { + return __riscv_sf_vlte8(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste16.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste16.c new file mode 100644 index 0000000000000..e1fff6c95ce03 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste16.c @@ -0,0 +1,49 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zvfhmin -target-feature +zvfbfmin \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste16_bf16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste16_bf16(size_t tss, __bf16 *base, size_t vl) { + return __riscv_sf_vste16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste16_f16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste16_f16(size_t tss, _Float16 *base, size_t vl) { + return __riscv_sf_vste16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste16_i16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: 
+// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste16_i16(size_t tss, int16_t *base, size_t vl) { + return __riscv_sf_vste16(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste16_u16( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste16.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste16_u16(size_t tss, uint16_t *base, size_t vl) { + return __riscv_sf_vste16(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste32.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste32.c new file mode 100644 index 0000000000000..0c3bc4ce95912 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste32.c @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste32_f32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste32_f32(size_t tss, float *base, size_t vl) { + return __riscv_sf_vste32(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste32_i32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste32_i32(size_t tss, int32_t *base, size_t vl) { + return __riscv_sf_vste32(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste32_u32( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste32.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste32_u32(size_t tss, uint32_t *base, size_t vl) { + return __riscv_sf_vste32(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste64.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste64.c new file mode 100644 index 0000000000000..0a568073e1ab9 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste64.c @@ -0,0 +1,38 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste64_f64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste64_f64(size_t tss, double *base, size_t vl) { + return __riscv_sf_vste64(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste64_i64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef 
[[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste64_i64(size_t tss, int64_t *base, size_t vl) { + return __riscv_sf_vste64(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste64_u64( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste64.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste64_u64(size_t tss, uint64_t *base, size_t vl) { + return __riscv_sf_vste64(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste8.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste8.c new file mode 100644 index 0000000000000..3115945796ba0 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vste8.c @@ -0,0 +1,28 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste8_i8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste8.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste8_i8(size_t tss, int8_t *base, size_t vl) { + return __riscv_sf_vste8(tss, base, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vste8_u8( +// CHECK-RV64-SAME: i64 noundef 
[[TSS:%.*]], ptr noundef [[BASE:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vste8.i64(i64 [[TSS]], ptr [[BASE]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vste8_u8(size_t tss, uint8_t *base, size_t vl) { + return __riscv_sf_vste8(tss, base, vl); +} + diff --git a/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vtmv_t_v.c b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vtmv_t_v.c new file mode 100644 index 0000000000000..2461c39c64c56 --- /dev/null +++ b/clang/test/CodeGen/RISCV/rvv-intrinsics-sifive/non-policy/overloaded/sf_vtmv_t_v.c @@ -0,0 +1,130 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 4 +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +xsfmmbase \ +// RUN: -target-feature +zvfhmin -target-feature +zvfbfmin \ +// RUN: -target-feature +zve64d \ +// RUN: -disable-O0-optnone -emit-llvm %s -o - | \ +// RUN: opt -S -passes=mem2reg | FileCheck --check-prefix=CHECK-RV64 %s + +#include + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_bf16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0:[0-9]+]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv32bf16.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_bf16m8(size_t tss, vbfloat16m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_f16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv32f16.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_f16m8(size_t tss, vfloat16m8_t src, size_t 
vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_f32m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv16f32.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_f32m8(size_t tss, vfloat32m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_f64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv8f64.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_f64m8(size_t tss, vfloat64m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_i8m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv64i8.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_i8m8(size_t tss, vint8m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_i16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv32i16.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_i16m8(size_t tss, vint16m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_i32m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) 
#[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv16i32.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_i32m8(size_t tss, vint32m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_i64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv8i64.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_i64m8(size_t tss, vint64m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_u8m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv64i8.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_u8m8(size_t tss, vuint8m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_u16m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv32i16.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_u16m8(size_t tss, vuint16m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_u32m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv16i32.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void 
test_sf_vtmv_t_v_u32m8(size_t tss, vuint32m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + +// CHECK-RV64-LABEL: define dso_local void @test_sf_vtmv_t_v_u64m8( +// CHECK-RV64-SAME: i64 noundef [[TSS:%.*]], [[SRC:%.*]], i64 noundef [[VL:%.*]]) #[[ATTR0]] { +// CHECK-RV64-NEXT: entry: +// CHECK-RV64-NEXT: call void @llvm.riscv.sf.vtmv.t.v.nxv8i64.i64(i64 [[TSS]], [[SRC]], i64 [[VL]]) +// CHECK-RV64-NEXT: ret void +// +void test_sf_vtmv_t_v_u64m8(size_t tss, vuint64m8_t src, size_t vl) { + return __riscv_sf_vtmv_t_v(tss, src, vl); +} + diff --git a/clang/test/CodeGen/X86/avx-builtins.c b/clang/test/CodeGen/X86/avx-builtins.c index 26aed77283391..737febbc7fef6 100644 --- a/clang/test/CodeGen/X86/avx-builtins.c +++ b/clang/test/CodeGen/X86/avx-builtins.c @@ -1891,12 +1891,16 @@ __m256d test_mm256_shuffle_pd(__m256d A, __m256d B) { return _mm256_shuffle_pd(A, B, 0); } +TEST_CONSTEXPR((match_m256d(_mm256_shuffle_pd(((__m256d)(__v4df){1.0, 2.0, 3.0, 4.0}), ((__m256d)(__v4df){5.0, 6.0, 7.0, 8.0}), 15), 2.0, 6.0, 4.0, 8.0))); + __m256 test_mm256_shuffle_ps(__m256 A, __m256 B) { // CHECK-LABEL: test_mm256_shuffle_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> return _mm256_shuffle_ps(A, B, 0); } +TEST_CONSTEXPR((match_m256(_mm256_shuffle_ps(((__m256)(__v8sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256)(__v8sf){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 4), 1.0f, 2.0f, 9.0f, 9.0f, 5.0f, 6.0f, 13.0f, 13.0f))); + __m256d test_mm256_sqrt_pd(__m256d A) { // CHECK-LABEL: test_mm256_sqrt_pd // CHECK: call {{.*}}<4 x double> @llvm.sqrt.v4f64(<4 x double> %{{.*}}) diff --git a/clang/test/CodeGen/X86/avx2-builtins.c b/clang/test/CodeGen/X86/avx2-builtins.c index 03b1bdeb55d7c..de4cb2fd0b055 100644 --- a/clang/test/CodeGen/X86/avx2-builtins.c +++ b/clang/test/CodeGen/X86/avx2-builtins.c @@ -1155,23 +1155,30 @@ __m256i test_mm256_shufflelo_epi16(__m256i a) { return _mm256_shufflelo_epi16(a, 83); } 
TEST_CONSTEXPR(match_v16hi(_mm256_shufflelo_epi16(((__m256i)(__v16hi){ 0,1,2,3, 4,5,6,7, 8,9,10,11, 12,13,14,15}), 83), 3,0,1,1, 4,5,6,7, 11,8,9,9, 12,13,14,15) ); + __m256i test_mm256_sign_epi8(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_sign_epi8 // CHECK: call <32 x i8> @llvm.x86.avx2.psign.b(<32 x i8> %{{.*}}, <32 x i8> %{{.*}}) return _mm256_sign_epi8(a, b); } +TEST_CONSTEXPR(match_v32qi(_mm256_sign_epi8( + (__m256i)(__v32qs){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1}, + (__m256i)(__v32qs){0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1}), + 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,-1)); __m256i test_mm256_sign_epi16(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_sign_epi16 // CHECK: call <16 x i16> @llvm.x86.avx2.psign.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}}) return _mm256_sign_epi16(a, b); } +TEST_CONSTEXPR(match_v16hi(_mm256_sign_epi16((__m256i)(__v16hi){0x77,0x77,0xbe,0xbe, -0x9,-0x9,-0x8,-0x8, 0,0,0,0, 0,0,0,0}, (__m256i)(__v16hi){-1,-256,1,256, -512,-1028,512,1028, -2048,-4096,0,0, 0,0,0,0}), -0x77,-0x77,0xbe,0xbe, 0x9,0x9,-0x8,-0x8, 0,0,0,0, 0,0,0,0)); __m256i test_mm256_sign_epi32(__m256i a, __m256i b) { // CHECK-LABEL: test_mm256_sign_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.psign.d(<8 x i32> %{{.*}}, <8 x i32> %{{.*}}) return _mm256_sign_epi32(a, b); } +TEST_CONSTEXPR(match_v8si(_mm256_sign_epi32((__m256i)(__v8si){0xbeef,0xfeed,0xbead,0xdeed, -1,2,-3,4}, (__m256i)(__v8si){0,0,0,0,-1,-1,-1,-1}), 0,0,0,0, 1,-2,3,-4)); __m256i test_mm256_slli_epi16(__m256i a) { // CHECK-LABEL: test_mm256_slli_epi16 @@ -1229,6 +1236,8 @@ __m256i test_mm256_slli_si256(__m256i a) { // CHECK: shufflevector <32 x i8> zeroinitializer, <32 x i8> %{{.*}}, <32 x i32> return _mm256_slli_si256(a, 3); } +TEST_CONSTEXPR(match_v32qi(_mm256_slli_si256(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 3), 0, 0, 0, 1, 2, 
3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29)); +TEST_CONSTEXPR(match_v32qi(_mm256_slli_si256(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m128i test_mm_sllv_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_sllv_epi32 @@ -1372,6 +1381,8 @@ __m256i test_mm256_srli_si256(__m256i a) { // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> zeroinitializer, <32 x i32> return _mm256_srli_si256(a, 3); } +TEST_CONSTEXPR(match_v32qi(_mm256_srli_si256(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 3), 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 0, 0, 0)); +TEST_CONSTEXPR(match_v32qi(_mm256_srli_si256(((__m256i)(__v32qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m128i test_mm_srlv_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_srlv_epi32 diff --git a/clang/test/CodeGen/X86/avx512bw-builtins.c b/clang/test/CodeGen/X86/avx512bw-builtins.c index d07e40a2b7071..be2cd480f7558 100644 --- a/clang/test/CodeGen/X86/avx512bw-builtins.c +++ b/clang/test/CodeGen/X86/avx512bw-builtins.c @@ -2045,6 +2045,8 @@ __m512i test_mm512_bslli_epi128(__m512i __A) { // CHECK: shufflevector <64 x i8> zeroinitializer, <64 x i8> %{{.*}}, <64 x i32> return _mm512_bslli_epi128(__A, 5); } +TEST_CONSTEXPR(match_v64qi(_mm512_bslli_epi128(((__m512i)(__v64qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 
43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 5), 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 0, 0, 0, 0, 0, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 0, 0, 0, 0, 0, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 0, 0, 0, 0, 0, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59)); +TEST_CONSTEXPR(match_v64qi(_mm512_bslli_epi128(((__m512i)(__v64qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m512i test_mm512_srlv_epi16(__m512i __A, __m512i __B) { // CHECK-LABEL: test_mm512_srlv_epi16 @@ -2223,6 +2225,9 @@ __m512i test_mm512_bsrli_epi128(__m512i __A) { // CHECK: shufflevector <64 x i8> %{{.*}}, <64 x i8> zeroinitializer, <64 x i32> return _mm512_bsrli_epi128(__A, 5); } +TEST_CONSTEXPR(match_v64qi(_mm512_bsrli_epi128(((__m512i)(__v64qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 5), 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 0, 0, 0, 0, 0, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 0, 0, 0, 0, 0, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 0, 0, 0, 0, 0, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v64qi(_mm512_bsrli_epi128(((__m512i)(__v64qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64}), 16), 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); + __m512i test_mm512_mask_mov_epi16(__m512i __W, __mmask32 __U, __m512i __A) { // CHECK-LABEL: test_mm512_mask_mov_epi16 // CHECK: select <32 x i1> %{{.*}}, <32 x i16> %{{.*}}, <32 x i16> %{{.*}} diff --git a/clang/test/CodeGen/X86/avx512f-builtins.c b/clang/test/CodeGen/X86/avx512f-builtins.c index 122dadd03a211..69599379b6b3d 100644 --- a/clang/test/CodeGen/X86/avx512f-builtins.c +++ b/clang/test/CodeGen/X86/avx512f-builtins.c @@ -6741,9 +6741,13 @@ __m512 test_mm512_maskz_shuffle_ps(__mmask16 __U, __m512 __M, __m512 __V) { // CHECK-LABEL: test_mm512_maskz_shuffle_ps // CHECK: shufflevector <16 x float> %{{.*}}, <16 x float> %{{.*}}, <16 x i32> // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}} - return _mm512_maskz_shuffle_ps(__U, __M, __V, 4); + return _mm512_maskz_shuffle_ps(__U, __M, __V, 4); } +TEST_CONSTEXPR((match_m512(_mm512_shuffle_ps(((__m512)(__v16sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), ((__m512)(__v16sf){17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f, 25.0f, 26.0f, 27.0f, 28.0f, 29.0f, 30.0f, 31.0f, 32.0f}), 4), 1.0f, 2.0f, 17.0f, 17.0f, 5.0f, 6.0f, 21.0f, 21.0f, 9.0f, 10.0f, 25.0f, 25.0f, 13.0f, 14.0f, 29.0f, 29.0f))); +TEST_CONSTEXPR((match_m512d(_mm512_shuffle_pd(((__m512d)(__v8df){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d)(__v8df){9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0}), 48), 1.0, 9.0, 3.0, 11.0, 6.0, 14.0, 7.0, 15.0))); +TEST_CONSTEXPR((match_m512d(_mm512_maskz_shuffle_pd(0xFF, ((__m512d)(__v8df){1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0}), ((__m512d)(__v8df){9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0}), 48), 1.0, 9.0, 3.0, 11.0, 6.0, 14.0, 7.0, 15.0))); + __m128d test_mm_sqrt_round_sd(__m128d __A, __m128d __B) { // CHECK-LABEL: 
test_mm_sqrt_round_sd // CHECK: call {{.*}}<2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x double> %{{.*}}, i8 -1, i32 11) diff --git a/clang/test/CodeGen/X86/avx512vl-builtins.c b/clang/test/CodeGen/X86/avx512vl-builtins.c index 34db7641b9448..33c43977f72dc 100644 --- a/clang/test/CodeGen/X86/avx512vl-builtins.c +++ b/clang/test/CodeGen/X86/avx512vl-builtins.c @@ -8933,9 +8933,14 @@ __m256 test_mm256_maskz_shuffle_ps(__mmask8 __U, __m256 __A, __m256 __B) { // CHECK-LABEL: test_mm256_maskz_shuffle_ps // CHECK: shufflevector <8 x float> %{{.*}}, <8 x float> %{{.*}}, <8 x i32> // CHECK: select <8 x i1> %{{.*}}, <8 x float> %{{.*}}, <8 x float> %{{.*}} - return _mm256_maskz_shuffle_ps(__U, __A, __B, 4); + return _mm256_maskz_shuffle_ps(__U, __A, __B, 4); } +TEST_CONSTEXPR((match_m128d(_mm_maskz_shuffle_pd(0x3, ((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 3), 2.0, 4.0))); +TEST_CONSTEXPR((match_m256d(_mm256_maskz_shuffle_pd(0xF, ((__m256d)(__v4df){1.0, 2.0, 3.0, 4.0}), ((__m256d)(__v4df){5.0, 6.0, 7.0, 8.0}), 15), 2.0, 6.0, 4.0, 8.0))); +TEST_CONSTEXPR((match_m128(_mm_maskz_shuffle_ps(0xF, ((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 4), 1.0f, 2.0f, 5.0f, 5.0f))); +TEST_CONSTEXPR((match_m256(_mm256_maskz_shuffle_ps(0xFF, ((__m256)(__v8sf){1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}), ((__m256)(__v8sf){9.0f, 10.0f, 11.0f, 12.0f, 13.0f, 14.0f, 15.0f, 16.0f}), 4), 1.0f, 2.0f, 9.0f, 9.0f, 5.0f, 6.0f, 13.0f, 13.0f))); + __m128d test_mm_rsqrt14_pd(__m128d __A) { // CHECK-LABEL: test_mm_rsqrt14_pd // CHECK: @llvm.x86.avx512.rsqrt14.pd.128 diff --git a/clang/test/CodeGen/X86/mmx-builtins.c b/clang/test/CodeGen/X86/mmx-builtins.c index 767425577a130..273138063a1b1 100644 --- a/clang/test/CodeGen/X86/mmx-builtins.c +++ b/clang/test/CodeGen/X86/mmx-builtins.c @@ -602,23 +602,28 @@ __m64 test_mm_shuffle_pi16(__m64 a) { return _mm_shuffle_pi16(a, 3); } 
TEST_CONSTEXPR(match_v4hi(_mm_shuffle_pi16(((__m64)(__v4hi){0,1,2,3}), 3), 3,0,0,0)); + __m64 test_mm_sign_pi8(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi8 // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128( return _mm_sign_pi8(a, b); } +TEST_CONSTEXPR(match_v8qi(_mm_sign_pi8((__m64)(__v8qi){0,0,0,0, 0,0,0,0}, (__m64)(__v8qi){0,0,0,0, 0,0,0,0}), 0,0,0,0, 0,0,0,0)); +TEST_CONSTEXPR(match_v8qi(_mm_sign_pi8((__m64)(__v8qi){6,7,6,7, 6,7,6,7}, (__m64)(__v8qi){1,1,1,1, 0,0,0,0}), 6,7,6,7, 0,0,0,0)); __m64 test_mm_sign_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128( return _mm_sign_pi16(a, b); } +TEST_CONSTEXPR(match_v4hi(_mm_sign_pi16((__m64)(__v4hi){-1,0,1,0}, (__m64)(__v4hi){1,0,-1,0}), -1,0,-1,0)); __m64 test_mm_sign_pi32(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sign_pi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128( return _mm_sign_pi32(a, b); } +TEST_CONSTEXPR(match_v2si(_mm_sign_pi32((__m64)(__v2si){0x7FFF, -1}, (__m64)(__v2si){-1, 0x7FFF}), -0x7FFF, -1)); __m64 test_mm_sll_pi16(__m64 a, __m64 b) { // CHECK-LABEL: test_mm_sll_pi16 diff --git a/clang/test/CodeGen/X86/sse-builtins.c b/clang/test/CodeGen/X86/sse-builtins.c index f5c1d00d1b851..6c5297e45dc82 100644 --- a/clang/test/CodeGen/X86/sse-builtins.c +++ b/clang/test/CodeGen/X86/sse-builtins.c @@ -738,6 +738,11 @@ __m128 test_mm_shuffle_ps(__m128 A, __m128 B) { return _mm_shuffle_ps(A, B, 0); } +TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 4), 1.0f, 2.0f, 5.0f, 5.0f))); +TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 0), 1.0f, 1.0f, 5.0f, 5.0f))); +TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 255), 4.0f, 4.0f, 8.0f, 8.0f))); 
+TEST_CONSTEXPR((match_m128(_mm_shuffle_ps(((__m128)(__v4sf){1.0f, 2.0f, 3.0f, 4.0f}), ((__m128)(__v4sf){5.0f, 6.0f, 7.0f, 8.0f}), 27), 4.0f, 3.0f, 6.0f, 5.0f))); + __m128 test_mm_sqrt_ps(__m128 x) { // CHECK-LABEL: test_mm_sqrt_ps // CHECK: call {{.*}}<4 x float> @llvm.sqrt.v4f32(<4 x float> {{.*}}) diff --git a/clang/test/CodeGen/X86/sse2-builtins.c b/clang/test/CodeGen/X86/sse2-builtins.c index 8e4fb86112c56..379ae48995d26 100644 --- a/clang/test/CodeGen/X86/sse2-builtins.c +++ b/clang/test/CodeGen/X86/sse2-builtins.c @@ -1314,6 +1314,11 @@ __m128d test_mm_shuffle_pd(__m128d A, __m128d B) { return _mm_shuffle_pd(A, B, 1); } +TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 3), 2.0, 4.0))); +TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 0), 1.0, 3.0))); +TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 1), 2.0, 3.0))); +TEST_CONSTEXPR((match_m128d(_mm_shuffle_pd(((__m128d)(__v2df){1.0, 2.0}), ((__m128d)(__v2df){3.0, 4.0}), 2), 1.0, 4.0))); + __m128i test_mm_shufflehi_epi16(__m128i A) { // CHECK-LABEL: test_mm_shufflehi_epi16 // CHECK: shufflevector <8 x i16> %{{.*}}, <8 x i16> poison, <8 x i32> @@ -1418,6 +1423,8 @@ __m128i test_mm_slli_si128(__m128i A) { // CHECK: shufflevector <16 x i8> zeroinitializer, <16 x i8> %{{.*}}, <16 x i32> return _mm_slli_si128(A, 5); } +TEST_CONSTEXPR(match_v16qi(_mm_slli_si128(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), 5), 0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11)); +TEST_CONSTEXPR(match_v16qi(_mm_slli_si128(((__m128i)(__v16qi){1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m128i test_mm_slli_si128_2(__m128i A) { // CHECK-LABEL: test_mm_slli_si128_2 @@ -1570,6 +1577,8 @@ __m128i test_mm_srli_si128(__m128i A) { // CHECK: shufflevector <16 x i8> %{{.*}}, <16 x 
i8> zeroinitializer, <16 x i32> return _mm_srli_si128(A, 5); } +TEST_CONSTEXPR(match_v16qi(_mm_srli_si128(((__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), 5), 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0, 0)); +TEST_CONSTEXPR(match_v16qi(_mm_srli_si128(((__m128i)(__v16qi){0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}), 16), 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0)); __m128i test_mm_srli_si128_2(__m128i A) { // CHECK-LABEL: test_mm_srli_si128_2 diff --git a/clang/test/CodeGen/X86/ssse3-builtins.c b/clang/test/CodeGen/X86/ssse3-builtins.c index f70afc01a1963..b7a4a2fe7ccd7 100644 --- a/clang/test/CodeGen/X86/ssse3-builtins.c +++ b/clang/test/CodeGen/X86/ssse3-builtins.c @@ -125,15 +125,18 @@ __m128i test_mm_sign_epi8(__m128i a, __m128i b) { // CHECK: call <16 x i8> @llvm.x86.ssse3.psign.b.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}}) return _mm_sign_epi8(a, b); } +TEST_CONSTEXPR(match_v16qi(_mm_sign_epi8((__m128i)(__v16qs){11,0,13,14, 0,16,17,18, 19,20,21,22, 23,24,25,26}, (__m128i)(__v16qs){0,1,0,1, -1,1,0,0, 0,0,1,1, -1,0,-1,0}), 0,0,0,14, 0,16,0,0, 0,0,21,22, -23,0,-25,0)); __m128i test_mm_sign_epi16(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_sign_epi16 // CHECK: call <8 x i16> @llvm.x86.ssse3.psign.w.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) return _mm_sign_epi16(a, b); } +TEST_CONSTEXPR(match_v8hi(_mm_sign_epi16((__m128i)(__v8hi){0,-2,0,-4,0,-6,0,-8}, (__m128i)(__v8hi){-1,-2,-3,-4,-5,-6,7,-8}), 0,2,0,4,0,6,0,8)); __m128i test_mm_sign_epi32(__m128i a, __m128i b) { // CHECK-LABEL: test_mm_sign_epi32 // CHECK: call <4 x i32> @llvm.x86.ssse3.psign.d.128(<4 x i32> %{{.*}}, <4 x i32> %{{.*}}) return _mm_sign_epi32(a, b); } +TEST_CONSTEXPR(match_v4si(_mm_sign_epi32((__m128i)(__v4si){-1,-2,-3,-4}, (__m128i)(__v4si){-4,-3,-2,-1}), 1,2,3,4)); diff --git a/clang/test/CodeGen/allow-ubsan-check-divergence.c b/clang/test/CodeGen/allow-ubsan-check-divergence.c new file mode 100644 index 0000000000000..a21d4f693269b --- /dev/null 
+++ b/clang/test/CodeGen/allow-ubsan-check-divergence.c @@ -0,0 +1,30 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6 + +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s \ +// RUN: | FileCheck %s --check-prefixes=CLEAN-O1 + +// RUN: %clang_cc1 -triple x86_64-pc-linux-gnu -emit-llvm -O1 -o - %s \ +// RUN: -fsanitize=signed-integer-overflow \ +// RUN: -fsanitize-skip-hot-cutoff=signed-integer-overflow=1.0 \ +// RUN: -fallow-runtime-check-skip-hot-cutoff=1.0 \ +// RUN: | FileCheck %s --check-prefixes=UBSAN-O1 + +// This test shows that -fsanitize-skip-hot-cutoff=...=1.0 plus +// -fallow-runtime-check-skip-hot-cutoff=1.0 does not perfectly undo the +// effects of -fsanitize. + +// CLEAN-O1-LABEL: define dso_local i32 @overflow( +// CLEAN-O1-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// CLEAN-O1-NEXT: [[ENTRY:.*:]] +// CLEAN-O1-NEXT: [[ADD:%.*]] = add nsw i32 [[Y]], [[X]] +// CLEAN-O1-NEXT: ret i32 [[ADD]] +// +// UBSAN-O1-LABEL: define dso_local noundef i32 @overflow( +// UBSAN-O1-SAME: i32 noundef [[X:%.*]], i32 noundef [[Y:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +// UBSAN-O1-NEXT: [[ENTRY:.*:]] +// UBSAN-O1-NEXT: [[TMP0:%.*]] = add i32 [[X]], [[Y]] +// UBSAN-O1-NEXT: ret i32 [[TMP0]] +// +int overflow(int x, int y) { + return x + y; +} diff --git a/clang/test/CodeGen/arm-acle-coproc.c b/clang/test/CodeGen/arm-acle-coproc.c index 93b713ba97318..5acb9f65413a0 100644 --- a/clang/test/CodeGen/arm-acle-coproc.c +++ b/clang/test/CodeGen/arm-acle-coproc.c @@ -26,6 +26,7 @@ // RUN: %clang_cc1 -triple armv9.4a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple armv9.5a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple armv9.6a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s +// RUN: %clang_cc1 -triple armv9.7a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: 
%clang_cc1 -triple thumbv4 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V4-THUMB %s // RUN: %clang_cc1 -triple thumbv4t %s -E -dD -o - | FileCheck --check-prefix=CHECK-V4-THUMB %s // RUN: %clang_cc1 -triple thumbv5 %s -E -dD -o - | FileCheck --check-prefix=CHECK-V5-THUMB %s @@ -56,6 +57,7 @@ // RUN: %clang_cc1 -triple thumbv9.4a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple thumbv9.5a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple thumbv9.6a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s +// RUN: %clang_cc1 -triple thumbv9.7a %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8 %s // RUN: %clang_cc1 -triple thumbv8m.base %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-BASE %s // RUN: %clang_cc1 -triple thumbv8m.main %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-MAIN %s // RUN: %clang_cc1 -triple thumbv8.1m.main %s -E -dD -o - | FileCheck --check-prefix=CHECK-V8-MAIN %s diff --git a/clang/test/CodeGen/arm-branch-protection-attr-2.c b/clang/test/CodeGen/arm-branch-protection-attr-2.c index fad5dc0707fb9..53915370a71cd 100644 --- a/clang/test/CodeGen/arm-branch-protection-attr-2.c +++ b/clang/test/CodeGen/arm-branch-protection-attr-2.c @@ -23,16 +23,16 @@ // NONE-NOT: !"branch-target-enforcement" // PART-NOT: !"branch-target-enforcement" // ALL-NOT: !"branch-target-enforcement" -// BTE: !{i32 8, !"branch-target-enforcement", i32 1} +// BTE: !{i32 8, !"branch-target-enforcement", i32 2} // NONE-NOT: !"sign-return-address" -// PART: !{i32 8, !"sign-return-address", i32 1} -// ALL: !{i32 8, !"sign-return-address", i32 1} +// PART: !{i32 8, !"sign-return-address", i32 2} +// ALL: !{i32 8, !"sign-return-address", i32 2} // BTE-NOT: !"sign-return-address" // NONE-NOT: !"sign-return-address-all", i32 0} // PART-NOT: !"sign-return-address-all", i32 0} -// ALL: !{i32 8, !"sign-return-address-all", i32 1} +// ALL: !{i32 8, !"sign-return-address-all", i32 2} // BTE-NOT: 
!"sign-return-address-all", i32 0} void foo() {} diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-1.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-1.hlsl new file mode 100644 index 0000000000000..ddd0baed41f37 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-1.hlsl @@ -0,0 +1,23 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Input { + uint Idx : SV_DispatchThreadID; + +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. + +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} + diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-2.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-2.hlsl new file mode 100644 index 0000000000000..0d9c91e746454 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-2.hlsl @@ -0,0 +1,25 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - 
%s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Input { + uint Idx : SV_DispatchThreadID; + uint Gid : SV_GroupID; +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. + +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP1:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK-DXIL: %[[#GID:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) +// CHECK-SPIRV:%[[#GID:]] = call i32 @llvm.[[TARGET]].group.id.i32(i32 0) +// CHECK: %[[#TMP2:]] = insertvalue %struct.Input %[[#TMP1]], i32 %[[#GID]], 1 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP2]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-inherit.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-inherit.hlsl new file mode 100644 index 0000000000000..f4c4d86933ca1 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-inherit.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Inner { + uint Gid; +}; + +struct Input { + uint Idx : SV_DispatchThreadID; + Inner inner : SV_GroupIndex; +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. 
+ +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP1:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK-DXIL: %[[#GID:]] = call i32 @llvm.dx.flattened.thread.id.in.group() +// CHECK-SPIRV:%[[#GID:]] = call i32 @llvm.spv.flattened.thread.id.in.group() +// CHECK: %[[#TMP2:]] = insertvalue %struct.Inner poison, i32 %[[#GID]], 0 +// CHECK: %[[#TMP3:]] = insertvalue %struct.Input %[[#TMP1]], %struct.Inner %[[#TMP2]], 1 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP3]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-shadow.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-shadow.hlsl new file mode 100644 index 0000000000000..e1344dd87a6ed --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested-shadow.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Inner { + uint Gid : SV_GroupID; +}; + +struct Input { + uint Idx : SV_DispatchThreadID; + Inner inner : SV_GroupIndex; +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. 
+ +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP1:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK-DXIL: %[[#GID:]] = call i32 @llvm.dx.flattened.thread.id.in.group() +// CHECK-SPIRV:%[[#GID:]] = call i32 @llvm.spv.flattened.thread.id.in.group() +// CHECK: %[[#TMP2:]] = insertvalue %struct.Inner poison, i32 %[[#GID]], 0 +// CHECK: %[[#TMP3:]] = insertvalue %struct.Input %[[#TMP1]], %struct.Inner %[[#TMP2]], 1 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP3]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} diff --git a/clang/test/CodeGenHLSL/semantics/semantic-struct-nested.hlsl b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested.hlsl new file mode 100644 index 0000000000000..cd6f9460bc617 --- /dev/null +++ b/clang/test/CodeGenHLSL/semantics/semantic-struct-nested.hlsl @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL -DTARGET=dx +// RUN: %clang_cc1 -triple spirv-linux-vulkan-library -x hlsl -emit-llvm -finclude-default-header -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -DTARGET=spv + + +struct Inner { + uint Gid : SV_GroupID; +}; + +struct Input { + uint Idx : SV_DispatchThreadID; + Inner inner; +}; + +// Make sure SV_DispatchThreadID translated into dx.thread.id. 
+ +// CHECK: define void @foo() +// CHECK-DXIL: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id(i32 0) +// CHECK-SPIRV: %[[#ID:]] = call i32 @llvm.[[TARGET]].thread.id.i32(i32 0) +// CHECK: %[[#TMP1:]] = insertvalue %struct.Input poison, i32 %[[#ID]], 0 +// CHECK-DXIL: %[[#GID:]] = call i32 @llvm.[[TARGET]].group.id(i32 0) +// CHECK-SPIRV:%[[#GID:]] = call i32 @llvm.[[TARGET]].group.id.i32(i32 0) +// CHECK: %[[#TMP2:]] = insertvalue %struct.Inner poison, i32 %[[#GID]], 0 +// CHECK: %[[#TMP3:]] = insertvalue %struct.Input %[[#TMP1]], %struct.Inner %[[#TMP2]], 1 +// CHECK: %[[#VAR:]] = alloca %struct.Input, align 8 +// CHECK: store %struct.Input %[[#TMP3]], ptr %[[#VAR]], align 4 +// CHECK-DXIL: call void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +// CHECK-SPIRV: call spir_func void @{{.*}}foo{{.*}}(ptr %[[#VAR]]) +[shader("compute")] +[numthreads(8,8,1)] +void foo(Input input) {} diff --git a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl index 8c3e5b70ea308..14fbeb24a96c2 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-cluster-dims.cl @@ -26,8 +26,8 @@ kernel void foo(global int *p) { *p = 1; } // CHECK-NEXT: ret void // //. 
-// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" } -// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" } +// CHECK: attributes #[[ATTR0]] = { convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" 
"no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" "uniform-work-group-size"="false" } +// CHECK: attributes #[[ATTR1]] = { alwaysinline convergent norecurse nounwind "amdgpu-cluster-dims"="0,0,0" "amdgpu-flat-work-group-size"="1,256" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1250" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" } // CHECK: attributes #[[ATTR2]] = { convergent nounwind } //. 
// CHECK: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 600} diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl index 7cc83c0a8064e..9bd096f3fcbc7 100644 --- a/clang/test/CodeGenOpenCL/amdgpu-features.cl +++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl @@ -109,8 +109,8 @@ // GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" // GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32" -// GFX1250: 
"target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" -// GFX1251: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX1250: 
"target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" +// GFX1251: "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bitop3-insts,+ci-insts,+clusters,+dl-insts,+dot7-insts,+dot8-insts,+dpp,+fp8-conversion-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+permlane16-swap,+pk-add-min-max-insts,+prng-inst,+setprio-inc-wg-inst,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+wavefrontsize32" // GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+atomic-fmin-fmax-global-f32,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64" diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl index b6b475a7565ba..e4a5fe9014c2e 100644 --- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl +++ 
b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250.cl @@ -21,6 +21,7 @@ typedef float __attribute__((ext_vector_type(8))) float8; typedef float __attribute__((ext_vector_type(16))) float16; typedef float __attribute__((ext_vector_type(32))) float32; typedef short __attribute__((ext_vector_type(2))) short2; +typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; // CHECK-LABEL: @test_setprio_inc_wg( // CHECK-NEXT: entry: @@ -1718,3 +1719,111 @@ void test_cvt_f32_fp8_e5m3(global int* out, int a) *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 2); *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, 3); } + +// CHECK-LABEL: @test_add_min_max( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca i32, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store i32 [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.add.max.i32(i32 [[TMP0]], i32 [[TMP1]], i32 [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr 
[[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.add.max.u32(i32 [[TMP5]], i32 [[TMP6]], i32 [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP8]], ptr addrspace(1) [[TMP9]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call i32 @llvm.amdgcn.add.min.i32(i32 [[TMP10]], i32 [[TMP11]], i32 [[TMP12]], i1 false) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP13]], ptr addrspace(1) [[TMP14]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.add.min.u32(i32 [[TMP15]], i32 [[TMP16]], i32 [[TMP17]], i1 true) +// CHECK-NEXT: [[TMP19:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store i32 [[TMP18]], ptr addrspace(1) [[TMP19]], align 4 +// CHECK-NEXT: ret void +// +void test_add_min_max(global int *out, int a, int b, int c) +{ + *out = __builtin_amdgcn_add_max_i32(a, b, c, false); + *out = __builtin_amdgcn_add_max_u32(a, b, c, true); + *out = __builtin_amdgcn_add_min_i32(a, b, c, false); + *out = __builtin_amdgcn_add_min_u32(a, b, c, true); +} + +// CHECK-LABEL: @test_pk_add_min_max( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[UOUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[B_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[C_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[UA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[UB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[UC_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5) +// CHECK-NEXT: [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr +// CHECK-NEXT: [[UOUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UOUT_ADDR]] to ptr +// CHECK-NEXT: [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr +// CHECK-NEXT: [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr +// CHECK-NEXT: [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr +// CHECK-NEXT: [[UA_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UA_ADDR]] to ptr +// CHECK-NEXT: [[UB_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UB_ADDR]] to ptr +// CHECK-NEXT: [[UC_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[UC_ADDR]] to ptr +// CHECK-NEXT: store ptr addrspace(1) [[OUT:%.*]], ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store ptr addrspace(1) [[UOUT:%.*]], ptr [[UOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[A:%.*]], ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[B:%.*]], ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[C:%.*]], ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[UA:%.*]], ptr [[UA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[UB:%.*]], ptr [[UB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: store <2 x i16> [[UC:%.*]], ptr [[UC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr 
[[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = load <2 x i16>, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.max.i16(<2 x i16> [[TMP0]], <2 x i16> [[TMP1]], <2 x i16> [[TMP2]], i1 false) +// CHECK-NEXT: [[TMP4:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[TMP3]], ptr addrspace(1) [[TMP4]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = load <2 x i16>, ptr [[UA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP6:%.*]] = load <2 x i16>, ptr [[UB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP7:%.*]] = load <2 x i16>, ptr [[UC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP8:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.max.u16(<2 x i16> [[TMP5]], <2 x i16> [[TMP6]], <2 x i16> [[TMP7]], i1 true) +// CHECK-NEXT: [[TMP9:%.*]] = load ptr addrspace(1), ptr [[UOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[TMP8]], ptr addrspace(1) [[TMP9]], align 4 +// CHECK-NEXT: [[TMP10:%.*]] = load <2 x i16>, ptr [[A_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP11:%.*]] = load <2 x i16>, ptr [[B_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP12:%.*]] = load <2 x i16>, ptr [[C_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP13:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.min.i16(<2 x i16> [[TMP10]], <2 x i16> [[TMP11]], <2 x i16> [[TMP12]], i1 false) +// CHECK-NEXT: [[TMP14:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[TMP13]], ptr addrspace(1) [[TMP14]], align 4 +// CHECK-NEXT: [[TMP15:%.*]] = load <2 x i16>, ptr [[UA_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP16:%.*]] = load <2 x i16>, ptr [[UB_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP17:%.*]] = load <2 x i16>, ptr [[UC_ADDR_ASCAST]], align 4 +// CHECK-NEXT: [[TMP18:%.*]] = call <2 x i16> @llvm.amdgcn.pk.add.min.u16(<2 x i16> [[TMP15]], <2 x i16> [[TMP16]], <2 x i16> [[TMP17]], i1 true) +// CHECK-NEXT: 
[[TMP19:%.*]] = load ptr addrspace(1), ptr [[UOUT_ADDR_ASCAST]], align 8 +// CHECK-NEXT: store <2 x i16> [[TMP18]], ptr addrspace(1) [[TMP19]], align 4 +// CHECK-NEXT: ret void +// +void test_pk_add_min_max(global short2 *out, global ushort2 *uout, short2 a, short2 b, short2 c, ushort2 ua, ushort2 ub, ushort2 uc) +{ + *out = __builtin_amdgcn_pk_add_max_i16(a, b, c, false); + *uout = __builtin_amdgcn_pk_add_max_u16(ua, ub, uc, true); + *out = __builtin_amdgcn_pk_add_min_i16(a, b, c, false); + *uout = __builtin_amdgcn_pk_add_min_u16(ua, ub, uc, true); +} diff --git a/clang/test/Driver/DTLTO/Inputs/dtlto-helper.py b/clang/test/Driver/DTLTO/Inputs/dtlto-helper.py new file mode 100644 index 0000000000000..6eb9f68e43bce --- /dev/null +++ b/clang/test/Driver/DTLTO/Inputs/dtlto-helper.py @@ -0,0 +1,11 @@ +from pathlib import Path +import sys + +# Arg 1: "clang" path. +p = Path(sys.argv[1]) +print(f"clang-name:{p.resolve().name}") +# Arg 2: Non-zero for LLVM driver. +if sys.argv[2] != "0": + print(f'prepend-arg:"--thinlto-remote-compiler-prepend-arg={p.name}"') +else: + print("prepend-arg: ") diff --git a/clang/test/Driver/DTLTO/dtlto.c b/clang/test/Driver/DTLTO/dtlto.c index ecaa8e5132553..a5bc273c378a5 100644 --- a/clang/test/Driver/DTLTO/dtlto.c +++ b/clang/test/Driver/DTLTO/dtlto.c @@ -4,16 +4,18 @@ /// Check DTLTO options are forwarded to the linker. /// Check that options are forwarded as expected with --thinlto-distributor=. 
-// RUN: %python %S/filename.py %clang > %t_forward.log +// RUN: %python %S/Inputs/dtlto-helper.py %clang %llvm-driver > %t_forward.log // RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ // RUN: -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 \ // RUN: -fthinlto-distributor=d.exe -Werror >>%t_forward.log 2>&1 // RUN: FileCheck %s --input-file=%t_forward.log --check-prefix=FORWARD -// FORWARD: filename.py:[[CLANG:.*]] -// FORWARD: ld.lld +// FORWARD: clang-name:[[CLANG:.*]] +// FORWARD-NEXT: prepend-arg:[[PREPEND_ARG:.*]] +// FORWARD: ld.lld // FORWARD-SAME: "--thinlto-distributor=d.exe" // FORWARD-SAME: "--thinlto-remote-compiler={{[^"]*}}[[CLANG]]" +// FORWARD-SAME: [[PREPEND_ARG]] // FORWARD-SAME: "--thinlto-distributor-arg=a1" // FORWARD-SAME: "--thinlto-distributor-arg=a2" // FORWARD-SAME: "--thinlto-distributor-arg=a3" @@ -31,24 +33,25 @@ /// Check the expected arguments are forwarded by default with only /// --thinlto-distributor=. -// RUN: %python %S/filename.py %clang > %t_default.log +// RUN: %python %S/Inputs/dtlto-helper.py %clang %llvm-driver > %t_default.log // RUN: %clang -flto=thin %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ // RUN: -fthinlto-distributor=d.exe -Werror >>%t_default.log 2>&1 // RUN: FileCheck %s --input-file=%t_default.log --check-prefix=DEFAULT \ // RUN: --implicit-check-not=distributor --implicit-check-not=remote-compiler -// DEFAULT: filename.py:[[CLANG:.*]] -// DEFAULT: ld.lld +// DEFAULT: clang-name:[[CLANG:.*]] +// DEFAULT-NEXT: prepend-arg:[[PREPEND_ARG:.*]] +// DEFAULT: ld.lld // DEFAULT-SAME: "--thinlto-distributor=d.exe" // DEFAULT-SAME: "--thinlto-remote-compiler={{[^"]*}}[[CLANG]]" +// DEFAULT-SAME: [[PREPEND_ARG]] /// Check that nothing is forwarded when the compiler is not in LTO mode, and that /// appropriate unused option warnings are issued. 
-// RUN: %python %S/filename.py %clang > %t_noflto.log // RUN: %clang %s -### -fuse-ld=lld --target=x86_64-linux-gnu \ -// RUN: -fthinlto-distributor=d.exe >>%t_noflto.log 2>&1 -// RUN: FileCheck %s --input-file=%t_noflto.log --check-prefix=NOFLTO \ -// RUN: --implicit-check-not=distributor --implicit-check-not=remote-compiler +// RUN: -fthinlto-distributor=d.exe 2>&1 | \ +// RUN: FileCheck %s --check-prefix=NOFLTO --implicit-check-not=distributor \ +// RUN: --implicit-check-not=remote-compiler // NOFLTO: warning: argument unused during compilation: '-fthinlto-distributor=d.exe' // NOFLTO: ld.lld diff --git a/clang/test/Driver/DTLTO/filename.py b/clang/test/Driver/DTLTO/filename.py deleted file mode 100644 index df1aeb6682543..0000000000000 --- a/clang/test/Driver/DTLTO/filename.py +++ /dev/null @@ -1,4 +0,0 @@ -from pathlib import Path -import sys - -print(f"filename.py:{Path(sys.argv[1]).resolve().name}") diff --git a/clang/test/Driver/DTLTO/lit.local.cfg b/clang/test/Driver/DTLTO/lit.local.cfg new file mode 100644 index 0000000000000..0756c5cfa11ca --- /dev/null +++ b/clang/test/Driver/DTLTO/lit.local.cfg @@ -0,0 +1,3 @@ +from lit.llvm import llvm_config + +config.substitutions.append(("%llvm-driver", "1" if "llvm-driver" in config.available_features else "0")) diff --git a/clang/test/Driver/DTLTO/ps5-dtlto.c b/clang/test/Driver/DTLTO/ps5-dtlto.c index 4c10a026f944b..b59fb48512000 100644 --- a/clang/test/Driver/DTLTO/ps5-dtlto.c +++ b/clang/test/Driver/DTLTO/ps5-dtlto.c @@ -3,16 +3,18 @@ /// Check DTLTO options are forwarded to the linker. /// Check that options are forwarded as expected with --thinlto-distributor=. 
-// RUN: %python %S/filename.py %clang > %t_forward.log +// RUN: %python %S/Inputs/dtlto-helper.py %clang %llvm-driver > %t_forward.log // RUN: %clang -flto=thin %s -### --target=x86_64-sie-ps5 \ // RUN: -Xthinlto-distributor=a1 -Xthinlto-distributor=a2,a3 \ // RUN: -fthinlto-distributor=d.exe -Werror >>%t_forward.log 2>&1 // RUN: FileCheck %s --input-file=%t_forward.log --check-prefix=FORWARD -// FORWARD: filename.py:[[CLANG:.*]] -// FORWARD: prospero-lld +// FORWARD: clang-name:[[CLANG:.*]] +// FORWARD-NEXT: prepend-arg:[[PREPEND_ARG:.*]] +// FORWARD: prospero-lld // FORWARD-SAME: "--thinlto-distributor=d.exe" // FORWARD-SAME: "--thinlto-remote-compiler={{[^"]*}}[[CLANG]]" +// FORWARD-SAME: [[PREPEND_ARG]] // FORWARD-SAME: "--thinlto-distributor-arg=a1" // FORWARD-SAME: "--thinlto-distributor-arg=a2" // FORWARD-SAME: "--thinlto-distributor-arg=a3" @@ -30,20 +32,22 @@ /// Check the expected arguments are forwarded by default with only /// --thinlto-distributor=. -// RUN: %python %S/filename.py %clang > %t_default.log +// RUN: %python %S/Inputs/dtlto-helper.py %clang %llvm-driver > %t_default.log // RUN: %clang -flto=thin %s -### --target=x86_64-sie-ps5 \ // RUN: -fthinlto-distributor=d.exe -Werror >>%t_default.log 2>&1 // RUN: FileCheck %s --input-file=%t_default.log --check-prefix=DEFAULT \ // RUN: --implicit-check-not=distributor --implicit-check-not=remote-compiler -// DEFAULT: filename.py:[[CLANG:.*]] -// DEFAULT: prospero-lld +// DEFAULT: clang-name:[[CLANG:.*]] +// DEFAULT-NEXT: prepend-arg:[[PREPEND_ARG:.*]] +// DEFAULT: prospero-lld // DEFAULT-SAME: "--thinlto-distributor=d.exe" // DEFAULT-SAME: "--thinlto-remote-compiler={{[^"]*}}[[CLANG]]" +// DEFAULT-SAME: [[PREPEND_ARG]] /// Check that the arguments are forwarded unconditionally even when the /// compiler is not in LTO mode. 
-// RUN: %python %S/filename.py %clang > %t_noflto.log +// RUN: %python %S/Inputs/dtlto-helper.py %clang %llvm-driver > %t_noflto.log // RUN: %clang %s -### --target=x86_64-sie-ps5 \ // RUN: -fthinlto-distributor=d.exe -Werror >>%t_noflto.log 2>&1 // RUN: FileCheck %s --input-file=%t_noflto.log --check-prefix=DEFAULT \ diff --git a/clang/test/Driver/aarch64-ptrauth.c b/clang/test/Driver/aarch64-ptrauth.c index 5bb963a90fcbd..b080a77195c8c 100644 --- a/clang/test/Driver/aarch64-ptrauth.c +++ b/clang/test/Driver/aarch64-ptrauth.c @@ -22,25 +22,39 @@ // RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1 // RUN: %clang -### -c --target=aarch64-linux-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI1 // PAUTHABI1: "-cc1"{{.*}} "-triple" "aarch64-unknown-linux-pauthtest" -// PAUTHABI1-SAME: "-target-abi" "pauthtest" // PAUTHABI1-SAME: "-fptrauth-intrinsics" "-fptrauth-calls" "-fptrauth-returns" "-fptrauth-auth-traps" "-fptrauth-vtable-pointer-address-discrimination" "-fptrauth-vtable-pointer-type-discrimination" "-fptrauth-type-info-vtable-pointer-discrimination" "-fptrauth-indirect-gotos" "-fptrauth-init-fini" "-fptrauth-init-fini-address-discrimination" "-faarch64-jump-table-hardening" +// PAUTHABI1-SAME: "-target-abi" "pauthtest" +// PAUTHABI1-NOT: "-fptrauth-function-pointer-type-discrimination" -// RUN: %clang -### -c --target=aarch64 -mabi=pauthtest -fno-ptrauth-intrinsics \ +// RUN: %clang -### -c --target=aarch64-linux -mabi=pauthtest -fno-ptrauth-intrinsics \ // RUN: -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \ // RUN: -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \ // RUN: -fno-ptrauth-type-info-vtable-pointer-discrimination -fno-ptrauth-indirect-gotos \ // RUN: -fno-ptrauth-init-fini -fno-ptrauth-init-fini-address-discrimination \ // RUN: -fno-aarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 -// 
RUN: %clang -### -c --target=aarch64-pauthtest -fno-ptrauth-intrinsics \ +// RUN: %clang -### -c --target=aarch64-linux-pauthtest -fno-ptrauth-intrinsics \ // RUN: -fno-ptrauth-calls -fno-ptrauth-returns -fno-ptrauth-auth-traps \ // RUN: -fno-ptrauth-vtable-pointer-address-discrimination -fno-ptrauth-vtable-pointer-type-discrimination \ // RUN: -fno-ptrauth-type-info-vtable-pointer-discrimination -fno-ptrauth-indirect-gotos \ // RUN: -fno-ptrauth-init-fini -fno-ptrauth-init-fini-address-discrimination \ // RUN: -fno-aarch64-jump-table-hardening %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 -// PAUTHABI2: "-cc1" -// PAUTHABI2-NOT: "-fptrauth- + +//// Non-linux OS: pauthtest ABI has no effect in terms of passing ptrauth cc1 flags. +//// An error about unsupported ABI will be emitted later in pipeline (see ERR2 below) +// RUN: %clang -### -c --target=aarch64 -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI2 + +// PAUTHABI2: "-cc1" +// PAUTHABI2-SAME: "-target-abi" "pauthtest" +// PAUTHABI2-NOT: "-fptrauth- // PAUTHABI2-NOT: "-faarch64-jump-table-hardening" +//// Non-linux OS: pauthtest environment does not correspond to pauthtest ABI; aapcs is the default. 
+// RUN: %clang -### -c --target=aarch64-pauthtest %s 2>&1 | FileCheck %s --check-prefix=PAUTHABI3 +// PAUTHABI3: "-cc1" +// PAUTHABI3-SAME: "-target-abi" "aapcs" +// PAUTHABI3-NOT: "-fptrauth- +// PAUTHABI3-NOT: "-faarch64-jump-table-hardening" + // RUN: not %clang -### -c --target=x86_64 -fptrauth-intrinsics -fptrauth-calls -fptrauth-returns -fptrauth-auth-traps \ // RUN: -fptrauth-vtable-pointer-address-discrimination -fptrauth-vtable-pointer-type-discrimination \ // RUN: -fptrauth-type-info-vtable-pointer-discrimination -fptrauth-indirect-gotos -fptrauth-init-fini \ @@ -57,10 +71,12 @@ // ERR1-NEXT: error: unsupported option '-fptrauth-init-fini-address-discrimination' for target '{{.*}}' // ERR1-NEXT: error: unsupported option '-faarch64-jump-table-hardening' for target '{{.*}}' -//// Only support PAuth ABI for Linux as for now. -// RUN: not %clang -o /dev/null -c --target=aarch64-unknown -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2 -// RUN: not %clang -o /dev/null -c --target=aarch64-unknown-pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2 -// ERR2: error: ABI 'pauthtest' is not supported for 'aarch64-unknown-unknown-pauthtest' + +// RUN: not %clang -c --target=aarch64 -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR2 +//// The ABI is not specified explicitly, and for non-Linux pauthtest environment does not correspond +//// to pauthtest ABI (each OS target defines this behavior separately). Do not emit an error. +// RUN: %clang -c --target=aarch64-pauthtest %s -o /dev/null +// ERR2: error: unknown target ABI 'pauthtest' //// PAuth ABI is encoded as environment part of the triple, so don't allow to explicitly set other environments. 
// RUN: not %clang -### -c --target=aarch64-linux-gnu -mabi=pauthtest %s 2>&1 | FileCheck %s --check-prefix=ERR3 diff --git a/clang/test/Driver/aarch64-v96a.c b/clang/test/Driver/aarch64-v96a.c index de7890140ebd3..e0081bbbdabfe 100644 --- a/clang/test/Driver/aarch64-v96a.c +++ b/clang/test/Driver/aarch64-v96a.c @@ -6,7 +6,7 @@ // RUN: %clang -target aarch64 -mlittle-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s -// GENERICV96A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p2" +// GENERICV96A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} // RUN: %clang -target aarch64_be -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s // RUN: %clang -target aarch64_be -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s @@ -14,7 +14,7 @@ // RUN: %clang -target aarch64 -mbig-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s // RUN: %clang -target aarch64_be -mbig-endian -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s // RUN: %clang -target aarch64_be -mbig-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s -// GENERICV96A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p2" +// GENERICV96A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} 
"-target-feature" "+cmpbr"{{.*}} // ===== Features supported on aarch64 ===== diff --git a/clang/test/Driver/aarch64-v97a.c b/clang/test/Driver/aarch64-v97a.c new file mode 100644 index 0000000000000..1e54e6bbe3ecc --- /dev/null +++ b/clang/test/Driver/aarch64-v97a.c @@ -0,0 +1,59 @@ +// ===== Base v9.7a architecture ===== + +// RUN: %clang -target aarch64 -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s +// RUN: %clang -target aarch64 -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s +// RUN: %clang -target aarch64 -mlittle-endian -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s +// RUN: %clang -target aarch64 -mlittle-endian -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s +// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s +// RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A %s +// GENERICV97A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p3"{{.*}} + +// RUN: %clang -target aarch64_be -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s +// RUN: %clang -target aarch64_be -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s +// RUN: %clang -target aarch64 -mbig-endian -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s +// RUN: %clang -target aarch64 -mbig-endian -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s +// RUN: %clang -target aarch64_be -mbig-endian -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s +// RUN: %clang -target aarch64_be -mbig-endian -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV97A-BE %s +// GENERICV97A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" 
"-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p3"{{.*}} + +// ===== Features supported on aarch64 ===== + +// RUN: %clang -target aarch64 -march=armv9.7a+sme2p3 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SME2p3 %s +// RUN: %clang -target aarch64 -march=armv9.7-a+sme2p3 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SME2p3 %s +// V97A-SME2p3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+sme2p3" + +// RUN: %clang -target aarch64 -march=armv9.7a+sve2p3 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SVE2p3 %s +// RUN: %clang -target aarch64 -march=armv9.7-a+sve2p3 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SVE2p3 %s +// V97A-SVE2p3: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+sve2p3" + +// RUN: %clang -target aarch64 -march=armv9.7a+sve-b16mm -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SVE-B16MM %s +// RUN: %clang -target aarch64 -march=armv9.7-a+sve-b16mm -### -c %s 2>&1 | FileCheck -check-prefix=V97A-SVE-B16MM %s +// V97A-SVE-B16MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+sve-b16mm" + +// RUN: %clang -target aarch64 -march=armv9.7a+f16mm -### -c %s 2>&1 | FileCheck -check-prefix=V97A-F16MM %s +// RUN: %clang -target aarch64 -march=armv9.7-a+f16mm -### -c %s 2>&1 | FileCheck -check-prefix=V97A-F16MM %s +// V97A-F16MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+f16mm" + +// RUN: %clang -target aarch64 -march=armv9.7a+cmh -### -c %s 2>&1 | FileCheck -check-prefix=V97A-CMH %s +// RUN: %clang -target aarch64 -march=armv9.7-a+cmh -### -c %s 2>&1 | FileCheck -check-prefix=V97A-CMH %s +// V97A-CMH: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" 
"+cmh" + +// RUN: %clang -target aarch64 -march=armv9.7a+lscp -### -c %s 2>&1 | FileCheck -check-prefix=V97A-LSCP %s +// RUN: %clang -target aarch64 -march=armv9.7-a+lscp -### -c %s 2>&1 | FileCheck -check-prefix=V97A-LSCP %s +// V97A-LSCP: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+lscp" + +// RUN: %clang -target aarch64 -march=armv9.7a+tlbid -### -c %s 2>&1 | FileCheck -check-prefix=V97A-TLBID %s +// RUN: %clang -target aarch64 -march=armv9.7-a+tlbid -### -c %s 2>&1 | FileCheck -check-prefix=V97A-TLBID %s +// V97A-TLBID: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+tlbid" + +// RUN: %clang -target aarch64 -march=armv9.7a+mpamv2 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-MPAMv2 %s +// RUN: %clang -target aarch64 -march=armv9.7-a+mpamv2 -### -c %s 2>&1 | FileCheck -check-prefix=V97A-MPAMv2 %s +// V97A-MPAMv2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+mpamv2" + +// RUN: %clang -target aarch64 -march=armv9.7a+mtetc -### -c %s 2>&1 | FileCheck -check-prefix=V97A-MTETC %s +// RUN: %clang -target aarch64 -march=armv9.7-a+mtetc -### -c %s 2>&1 | FileCheck -check-prefix=V97A-MTETC %s +// V97A-MTETC: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+mtetc" + +// RUN: %clang -target aarch64 -march=armv9.7a+gcie -### -c %s 2>&1 | FileCheck -check-prefix=V97A-GCIE %s +// RUN: %clang -target aarch64 -march=armv9.7-a+gcie -### -c %s 2>&1 | FileCheck -check-prefix=V97A-GCIE %s +// V97A-GCIE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.7a"{{.*}} "-target-feature" "+gcie" diff --git a/clang/test/Driver/arm-cortex-cpus-1.c b/clang/test/Driver/arm-cortex-cpus-1.c index 5d3169ff3d503..08cb63b1a9230 100644 --- a/clang/test/Driver/arm-cortex-cpus-1.c +++ 
b/clang/test/Driver/arm-cortex-cpus-1.c @@ -512,3 +512,20 @@ // RUN: %clang -target arm -march=armebv9.6a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V96A %s // RUN: %clang -target arm -march=armebv9.6-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V96A %s // CHECK-BE-V96A: "-cc1"{{.*}} "-triple" "armebv9.6{{.*}}" "-target-cpu" "generic" +// +// RUN: %clang -target armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s +// RUN: %clang -target arm -march=armv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s +// RUN: %clang -target arm -march=armv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s +// RUN: %clang -target arm -march=armv9.7a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s +// RUN: %clang -target armv9.7a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s +// RUN: %clang -target arm -march=armv9.7a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s +// RUN: %clang -target arm -mlittle-endian -march=armv9.7-a -mlittle-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-V97A %s +// CHECK-V97A: "-cc1"{{.*}} "-triple" "armv9.7{{.*}}" "-target-cpu" "generic" + +// RUN: %clang -target armebv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s +// RUN: %clang -target armv9.7a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s +// RUN: %clang -target armeb -march=armebv9.7a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s +// RUN: %clang -target armeb -march=armebv9.7-a -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s +// RUN: %clang -target arm -march=armebv9.7a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s +// RUN: %clang -target arm -march=armebv9.7-a -mbig-endian -### -c %s 2>&1 | FileCheck -check-prefix=CHECK-BE-V97A %s +// CHECK-BE-V97A: "-cc1"{{.*}} "-triple" "armebv9.7{{.*}}" "-target-cpu" "generic" diff --git 
a/clang/test/Driver/baremetal-multilib-custom-error.yaml b/clang/test/Driver/baremetal-multilib-custom-error.yaml index 0be92e26aae28..bc06ed468572c 100644 --- a/clang/test/Driver/baremetal-multilib-custom-error.yaml +++ b/clang/test/Driver/baremetal-multilib-custom-error.yaml @@ -1,4 +1,3 @@ -# REQUIRES: shell # UNSUPPORTED: system-windows # RUN: %clang --multi-lib-config=%s -no-canonical-prefixes -print-multi-directory 2>&1 \ diff --git a/clang/test/Driver/config-file3.c b/clang/test/Driver/config-file3.c index f359e02705993..c9b26763794b0 100644 --- a/clang/test/Driver/config-file3.c +++ b/clang/test/Driver/config-file3.c @@ -1,9 +1,5 @@ // Needs symlinks // UNSUPPORTED: system-windows -// env -u is not supported on AIX. -// TODO(boomanaiden154): Remove this once we have switched over to lit's -// internal shell which does support env -u. -// UNSUPPORTED: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} // REQUIRES: x86-registered-target // RUN: rm -rf %t && mkdir %t diff --git a/clang/test/Driver/config-zos.c b/clang/test/Driver/config-zos.c index a21753ee5886f..055c4c981977b 100644 --- a/clang/test/Driver/config-zos.c +++ b/clang/test/Driver/config-zos.c @@ -1,9 +1,5 @@ // Needs symlinks // UNSUPPORTED: system-windows -// env -u is not supported on AIX. -// TODO(boomanaiden154): Remove this once we have switched over to lit's -// internal shell which does support env -u. -// UNSUPPORTED: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} // REQUIRES: systemz-registered-target // RUN: rm -rf %t && mkdir %t diff --git a/clang/test/Driver/config-zos1.c b/clang/test/Driver/config-zos1.c index 9dd6f55a777fc..cf4f13b3879df 100644 --- a/clang/test/Driver/config-zos1.c +++ b/clang/test/Driver/config-zos1.c @@ -1,8 +1,4 @@ // UNSUPPORTED: system-windows -// env -u is not supported on AIX. -// TODO(boomanaiden154): Remove this once we have switched over to lit's -// internal shell which does support env -u. 
-// UNSUPPORTED: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} // REQUIRES: systemz-registered-target // RUN: export CLANG_CONFIG_PATH=%S/Inputs/config-zos diff --git a/clang/test/Driver/fsanitize-alloc-token.c b/clang/test/Driver/fsanitize-alloc-token.c index 2964f60c4f26f..6d8bda16dfb96 100644 --- a/clang/test/Driver/fsanitize-alloc-token.c +++ b/clang/test/Driver/fsanitize-alloc-token.c @@ -41,3 +41,14 @@ // CHECK-MAX: "-falloc-token-max=42" // RUN: not %clang --target=x86_64-linux-gnu -fsanitize=alloc-token -falloc-token-max=-1 %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-MAX %s // CHECK-INVALID-MAX: error: invalid value + +// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=increment %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-INCREMENT %s +// CHECK-MODE-INCREMENT: "-falloc-token-mode=increment" +// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=random %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-RANDOM %s +// CHECK-MODE-RANDOM: "-falloc-token-mode=random" +// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=typehash %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-TYPEHASH %s +// CHECK-MODE-TYPEHASH: "-falloc-token-mode=typehash" +// RUN: %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=typehashpointersplit %s -### 2>&1 | FileCheck -check-prefix=CHECK-MODE-TYPEHASHPTRSPLIT %s +// CHECK-MODE-TYPEHASHPTRSPLIT: "-falloc-token-mode=typehashpointersplit" +// RUN: not %clang --target=x86_64-linux-gnu -Xclang -falloc-token-mode=asdf %s 2>&1 | FileCheck -check-prefix=CHECK-INVALID-MODE %s +// CHECK-INVALID-MODE: error: invalid value 'asdf' diff --git a/clang/test/Driver/fuchsia.c b/clang/test/Driver/fuchsia.c index 3fb2a94124cd1..d0fec18e13a20 100644 --- a/clang/test/Driver/fuchsia.c +++ b/clang/test/Driver/fuchsia.c @@ -312,3 +312,13 @@ // RUN: | FileCheck %s -check-prefix=CHECK-NOSTDLIB-NOLIBC // CHECK-NOSTDLIB-NOLIBC-NOT: "warning:" // CHECK-NOSTDLIB-NOLIBC-NOT: "error:" + +// RUN: not 
%clang -### %s --target=aarch64-unknown-fuchsia \ +// RUN: -fsanitize=safe-stack 2>&1 \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: | FileCheck %s -check-prefix=CHECK-NONX86-SAFESTACK +// RUN: not %clang -### %s --target=riscv64-unknown-fuchsia \ +// RUN: -fsanitize=safe-stack 2>&1 \ +// RUN: -resource-dir=%S/Inputs/resource_dir_with_per_target_subdir \ +// RUN: | FileCheck %s -check-prefix=CHECK-NONX86-SAFESTACK +// CHECK-NONX86-SAFESTACK: error: unsupported option '-fsanitize=safe-stack' for target '{{.*}}' diff --git a/clang/test/Driver/hexagon-toolchain-elf.c b/clang/test/Driver/hexagon-toolchain-elf.c index de2ebfeeda26c..bff4819488453 100644 --- a/clang/test/Driver/hexagon-toolchain-elf.c +++ b/clang/test/Driver/hexagon-toolchain-elf.c @@ -166,6 +166,13 @@ // CHECK250: "-cc1" {{.*}} "-target-cpu" "hexagonv79" // CHECK250: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v79/crt0 +// RUN: not %clang -### --target=hexagon-unknown-elf \ +// RUN: -ccc-install-dir %S/Inputs/hexagon_tree/Tools/bin \ +// RUN: -mcpu=hexagonv81 -fuse-ld=hexagon-link \ +// RUN: %s 2>&1 | FileCheck -check-prefix=CHECK260 %s +// CHECK260: "-cc1" {{.*}} "-target-cpu" "hexagonv81" +// CHECK260: hexagon-link{{.*}}/Inputs/hexagon_tree/Tools/bin/../target/hexagon/lib/v81/crt0 + // ----------------------------------------------------------------------------- // Test Linker related args // ----------------------------------------------------------------------------- diff --git a/clang/test/Driver/hip-toolchain-no-rdc.hip b/clang/test/Driver/hip-toolchain-no-rdc.hip index 2f40fd478671e..840334e19e7f2 100644 --- a/clang/test/Driver/hip-toolchain-no-rdc.hip +++ b/clang/test/Driver/hip-toolchain-no-rdc.hip @@ -214,3 +214,10 @@ // AMDGCNSPIRV: {{".*clang-offload-bundler.*"}} "-type=o" // AMDGCNSPIRV-SAME: "-targets={{.*}}hipv4-spirv64-amd-amdhsa--amdgcnspirv,hipv4-amdgcn-amd-amdhsa--gfx900" // AMDGCNSPIRV-SAME: "-input=[[AMDGCNSPV_CO]]" 
"-input=[[GFX900_CO]]" + +// Check verbose printing with the new driver. +// RUN: %clang -### --target=x86_64-linux-gnu -fno-gpu-rdc -nogpulib -nogpuinc \ +// RUN: --offload-new-driver --offload-arch=gfx908 -v %s 2>&1 | FileCheck %s --check-prefix=VERBOSE +// VERBOSE: clang-linker-wrapper +// VERBOSE-NOT: --device-compiler=amdgcn-amd-amdhsa=-v +// VERBOSE-NOT: --wrapper-verbose diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c index 5c840a60685c6..7294c33959e7e 100644 --- a/clang/test/Driver/print-supported-extensions-aarch64.c +++ b/clang/test/Driver/print-supported-extensions-aarch64.c @@ -8,6 +8,7 @@ // CHECK-NEXT: bf16 FEAT_BF16 Enable BFloat16 Extension // CHECK-NEXT: brbe FEAT_BRBE Enable Branch Record Buffer Extension // CHECK-NEXT: bti FEAT_BTI Enable Branch Target Identification +// CHECK-NEXT: cmh FEAT_CMH Enable Armv9.7-A Contention Management Hints // CHECK-NEXT: cmpbr FEAT_CMPBR Enable Armv9.6-A base compare and branch instructions // CHECK-NEXT: fcma FEAT_FCMA Enable Armv8.3-A Floating-point complex number support // CHECK-NEXT: cpa FEAT_CPA Enable Armv9.5-A Checked Pointer Arithmetic @@ -17,6 +18,9 @@ // CHECK-NEXT: d128 FEAT_D128, FEAT_LVA3, FEAT_SYSREG128, FEAT_SYSINSTR128 Enable Armv9.4-A 128-bit Page Table Descriptors, System Registers and instructions // CHECK-NEXT: dit FEAT_DIT Enable Armv8.4-A Data Independent Timing instructions // CHECK-NEXT: dotprod FEAT_DotProd Enable dot product support +// CHECK-NEXT: f16f32dot FEAT_F16F32DOT Enable Armv9.7-A Advanced SIMD half-precision dot product accumulate to single-precision +// CHECK-NEXT: f16f32mm FEAT_F16F32MM Enable Armv9.7-A Advanced SIMD half-precision matrix multiply-accumulate to single-precision +// CHECK-NEXT: f16mm FEAT_F16MM Enable Armv9.7-A non-widening half-precision matrix multiply-accumulate // CHECK-NEXT: f32mm FEAT_F32MM Enable Matrix Multiply FP32 Extension // CHECK-NEXT: f64mm FEAT_F64MM Enable Matrix 
Multiply FP64 Extension // CHECK-NEXT: f8f16mm FEAT_F8F16MM Enable Armv9.6-A FP8 to Half-Precision Matrix Multiplication @@ -31,19 +35,23 @@ // CHECK-NEXT: fp8fma FEAT_FP8FMA Enable Armv9.5-A FP8 multiply-add instructions // CHECK-NEXT: fprcvt FEAT_FPRCVT Enable Armv9.6-A base convert instructions for SIMD&FP scalar register operands of different input and output sizes // CHECK-NEXT: fp16 FEAT_FP16 Enable half-precision floating-point data processing +// CHECK-NEXT: gcie FEAT_GCIE Enable GICv5 (Generic Interrupt Controller) CPU Interface Extension // CHECK-NEXT: gcs FEAT_GCS Enable Armv9.4-A Guarded Call Stack Extension // CHECK-NEXT: hbc FEAT_HBC Enable Armv8.8-A Hinted Conditional Branches Extension // CHECK-NEXT: i8mm FEAT_I8MM Enable Matrix Multiply Int8 Extension // CHECK-NEXT: ite FEAT_ITE Enable Armv9.4-A Instrumentation Extension // CHECK-NEXT: jscvt FEAT_JSCVT Enable Armv8.3-A JavaScript FP conversion instructions // CHECK-NEXT: ls64 FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA Enable Armv8.7-A LD64B/ST64B Accelerator Extension +// CHECK-NEXT: lscp FEAT_LSCP Enable Armv9.7-A Load-acquire and store-release pair extension // CHECK-NEXT: lse FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions // CHECK-NEXT: lse128 FEAT_LSE128 Enable Armv9.4-A 128-bit Atomic instructions // CHECK-NEXT: lsfe FEAT_LSFE Enable Armv9.6-A base Atomic floating-point in-memory instructions // CHECK-NEXT: lsui FEAT_LSUI Enable Armv9.6-A unprivileged load/store instructions // CHECK-NEXT: lut FEAT_LUT Enable Lookup Table instructions // CHECK-NEXT: mops FEAT_MOPS Enable Armv8.8-A memcpy and memset acceleration instructions +// CHECK-NEXT: mpamv2 FEAT_MPAMv2 Enable Armv9.7-A MPAMv2 Lookaside Buffer Invalidate instructions // CHECK-NEXT: memtag FEAT_MTE, FEAT_MTE2 Enable Memory Tagging Extension +// CHECK-NEXT: mtetc FEAT_MTETC Enable Virtual Memory Tagging Extension // CHECK-NEXT: simd FEAT_AdvSIMD Enable Advanced SIMD instructions // CHECK-NEXT: occmo FEAT_OCCMO 
Enable Armv9.6-A Outer cacheable cache maintenance operations // CHECK-NEXT: pauth FEAT_PAuth Enable Armv8.3-A Pointer Authentication extension @@ -76,6 +84,7 @@ // CHECK-NEXT: sme2 FEAT_SME2 Enable Scalable Matrix Extension 2 (SME2) instructions // CHECK-NEXT: sme2p1 FEAT_SME2p1 Enable Scalable Matrix Extension 2.1 instructions // CHECK-NEXT: sme2p2 FEAT_SME2p2 Enable Armv9.6-A Scalable Matrix Extension 2.2 instructions +// CHECK-NEXT: sme2p3 FEAT_SME2p3 Enable Armv9.7-A Scalable Matrix Extension 2.3 instructions // CHECK-NEXT: profile FEAT_SPE Enable Statistical Profiling extension // CHECK-NEXT: predres2 FEAT_SPECRES2 Enable Speculation Restriction Instruction // CHECK-NEXT: ssbs FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit @@ -89,6 +98,7 @@ // CHECK-NEXT: sve-aes FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable SVE AES and quadword SVE polynomial multiply instructions // CHECK-NEXT: sve-aes2 FEAT_SVE_AES2 Enable Armv9.6-A SVE multi-vector AES and multi-vector quadword polynomial multiply instructions // CHECK-NEXT: sve-b16b16 FEAT_SVE_B16B16 Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions +// CHECK-NEXT: sve-b16mm FEAT_SVE_B16MM Enable Armv9.7-A SVE non-widening BFloat16 matrix multiply-accumulate // CHECK-NEXT: sve-bfscale FEAT_SVE_BFSCALE Enable Armv9.6-A SVE BFloat16 scaling instructions // CHECK-NEXT: sve-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions // CHECK-NEXT: sve-f16f32mm FEAT_SVE_F16F32MM Enable Armv9.6-A FP16 to FP32 Matrix Multiply @@ -101,7 +111,9 @@ // CHECK-NEXT: sve2-sm4 Shorthand for +sve2+sve-sm4 // CHECK-NEXT: sve2p1 FEAT_SVE2p1 Enable Scalable Vector Extension 2.1 instructions // CHECK-NEXT: sve2p2 FEAT_SVE2p2 Enable Armv9.6-A Scalable Vector Extension 2.2 instructions +// CHECK-NEXT: sve2p3 FEAT_SVE2p3 Enable Armv9.7-A Scalable Vector Extension 2.3 instructions // CHECK-NEXT: the FEAT_THE Enable Armv8.9-A Translation Hardening Extension +// CHECK-NEXT: tlbid FEAT_TLBID Enable 
Armv9.7-A TLBI Domains extension // CHECK-NEXT: tlbiw FEAT_TLBIW Enable Armv9.5-A TLBI VMALL for Dirty State // CHECK-NEXT: tme FEAT_TME Enable Transactional Memory Extension // CHECK-NEXT: wfxt FEAT_WFxT Enable Armv8.7-A WFET and WFIT instruction diff --git a/clang/test/Driver/print-supported-extensions-riscv.c b/clang/test/Driver/print-supported-extensions-riscv.c index 7972b9e43e73e..cb812736786a9 100644 --- a/clang/test/Driver/print-supported-extensions-riscv.c +++ b/clang/test/Driver/print-supported-extensions-riscv.c @@ -190,6 +190,11 @@ // CHECK-NEXT: xsfmm64t 0.6 'XSfmm64t' (TE=64 configuration) // CHECK-NEXT: xsfmmbase 0.6 'XSfmmbase' (All non arithmetic instructions for all TEWs and sf.vtzero) // CHECK-NEXT: xsfvcp 1.0 'XSfvcp' (SiFive Custom Vector Coprocessor Interface Instructions) +// CHECK-NEXT: xsfvfbfexp16e 0.5 'XSfvfbfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, BFloat16) +// CHECK-NEXT: xsfvfexp16e 0.5 'XSfvfexp16e' (SiFive Vector Floating-Point Exponential Function Instruction, Half Precision) +// CHECK-NEXT: xsfvfexp32e 0.5 'XSfvfexp32e' (SiFive Vector Floating-Point Exponential Function Instruction, Single Precision) +// CHECK-NEXT: xsfvfexpa 0.2 'XSfvfexpa' (SiFive Vector Floating-Point Exponential Approximation Instruction) +// CHECK-NEXT: xsfvfexpa64e 0.2 'XSfvfexpa64e' (SiFive Vector Floating-Point Exponential Approximation Instruction with Double-Precision) // CHECK-NEXT: xsfvfnrclipxfqf 1.0 'XSfvfnrclipxfqf' (SiFive FP32-to-int8 Ranged Clip Instructions) // CHECK-NEXT: xsfvfwmaccqqq 1.0 'XSfvfwmaccqqq' (SiFive Matrix Multiply Accumulate Instruction (4-by-4)) // CHECK-NEXT: xsfvqmaccdod 1.0 'XSfvqmaccdod' (SiFive Int8 Matrix Multiplication Instructions (2-by-8 and 8-by-2)) diff --git a/clang/test/Frontend/absolute-paths-symlinks.c b/clang/test/Frontend/absolute-paths-symlinks.c index 8170910b6f6ec..80bca34eb9d4b 100644 --- a/clang/test/Frontend/absolute-paths-symlinks.c +++ 
b/clang/test/Frontend/absolute-paths-symlinks.c @@ -12,6 +12,5 @@ // CHECK-SAME: error: unknown type name This do not compile -// REQUIRES: shell // Don't make symlinks on Windows. // UNSUPPORTED: system-windows diff --git a/clang/test/Frontend/arm-ignore-branch-protection-option.c b/clang/test/Frontend/arm-ignore-branch-protection-option.c index 99a2accef3ae2..45bdb37f5ed1a 100644 --- a/clang/test/Frontend/arm-ignore-branch-protection-option.c +++ b/clang/test/Frontend/arm-ignore-branch-protection-option.c @@ -15,4 +15,4 @@ __attribute__((target("arch=cortex-m0"))) void f() {} // CHECK-NOT: attributes { {{.*}} "branch-target-enforcement" /// Check that there are branch protection module attributes despite the warning. -// CHECK: !{i32 8, !"branch-target-enforcement", i32 1} +// CHECK: !{i32 8, !"branch-target-enforcement", i32 2} diff --git a/clang/test/Frontend/cfi-unchecked-callee-attribute.cpp b/clang/test/Frontend/cfi-unchecked-callee-attribute.cpp index 072f217ff7b19..a5a17dd5a4d82 100644 --- a/clang/test/Frontend/cfi-unchecked-callee-attribute.cpp +++ b/clang/test/Frontend/cfi-unchecked-callee-attribute.cpp @@ -9,6 +9,7 @@ void (*checked_ptr)(void) = unchecked; // expected-warning{{implicit conversion void (CFI_UNCHECKED_CALLEE *unchecked_ptr)(void) = unchecked; void (CFI_UNCHECKED_CALLEE *from_normal)(void) = checked; void (CFI_UNCHECKED_CALLEE *c_no_function_decay)(void) = &unchecked; +void (CFI_UNCHECKED_CALLEE __attribute__((noreturn)) *other_conflict)(void) = &checked; // expected-error{{cannot initialize a variable of type 'void (*)() __attribute__((noreturn)) __attribute__((cfi_unchecked_callee))' with an rvalue of type 'void (*)()'}} void (CFI_UNCHECKED_CALLEE *arr[10])(void); void (*cfi_elem)(void) = arr[1]; // expected-warning{{implicit conversion from 'void (*)() __attribute__((cfi_unchecked_callee))' to 'void (*)()' discards 'cfi_unchecked_callee' attribute}} void (CFI_UNCHECKED_CALLEE *cfi_unchecked_elem)(void) = arr[1]; diff --git 
a/clang/test/Headers/floatneeds.c b/clang/test/Headers/floatneeds.c new file mode 100644 index 0000000000000..e544b41009908 --- /dev/null +++ b/clang/test/Headers/floatneeds.c @@ -0,0 +1,41 @@ +// RUN: rm -fR %t +// RUN: split-file %s %t +// RUN: %clang_cc1 -fsyntax-only -verify=c99 -std=c99 %t/floatneeds0.c +// RUN: %clang_cc1 -fsyntax-only -verify=c99 -std=c99 %t/floatneeds1.c +// RUN: %clang_cc1 -fsyntax-only -verify=c23 -std=c23 %t/floatneeds0.c +// RUN: %clang_cc1 -fsyntax-only -verify=c23 -std=c23 %t/floatneeds1.c +// RUN: %clang_cc1 -fsyntax-only -verify=c99-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c99 %t/floatneeds0.c +// RUN: %clang_cc1 -fsyntax-only -verify=c99-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c99 %t/floatneeds1.c +// RUN: %clang_cc1 -fsyntax-only -verify=c23-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c23 %t/floatneeds0.c +// RUN: %clang_cc1 -fsyntax-only -verify=c23-modules -fmodules -fimplicit-module-maps -fmodules-cache-path=%t -std=c23 %t/floatneeds1.c + +// Use C99 to verify that __need_ can be used to get types that wouldn't normally be available. 
+ +//--- floatneeds0.c +float infinity0 = INFINITY; // c99-error{{undeclared identifier 'INFINITY'}} c23-error{{undeclared identifier 'INFINITY'}} \ + c99-modules-error{{undeclared identifier 'INFINITY'}} c23-modules-error{{undeclared identifier 'INFINITY'}} +float nan0 = NAN; // c99-error{{undeclared identifier 'NAN'}} c23-error{{undeclared identifier 'NAN'}} \ + c99-modules-error{{undeclared identifier 'NAN'}} c23-modules-error{{undeclared identifier 'NAN'}} +float max0 = FLT_MAX; // c99-error{{undeclared identifier 'FLT_MAX'}} c23-error{{undeclared identifier 'FLT_MAX'}} \ + c99-modules-error{{undeclared identifier 'FLT_MAX'}} c23-modules-error{{undeclared identifier 'FLT_MAX'}} + +#define __need_infinity_nan +#include +float infinity1 = INFINITY; +float nan1 = NAN; +float max1 = FLT_MAX; // c99-error{{undeclared identifier}} c23-error{{undeclared identifier}} \ + c99-modules-error{{undeclared identifier}} c23-modules-error{{undeclared identifier}} + +#include +float infinity2 = INFINITY; +float nan2 = NAN; +float max2 = FLT_MAX; + +//--- floatneeds1.c +// c23-no-diagnostics +// c23-modules-no-diagnostics + +#include +float infinity0 = INFINITY; // c99-error{{undeclared identifier}} c99-modules-error{{undeclared identifier}} +float nan0 = NAN; // c99-error{{undeclared identifier}} c99-modules-error{{undeclared identifier}} +float max0 = FLT_MAX; diff --git a/clang/test/Interpreter/pretty-print.c b/clang/test/Interpreter/pretty-print.c index 588df70e33e84..d0712fb152107 100644 --- a/clang/test/Interpreter/pretty-print.c +++ b/clang/test/Interpreter/pretty-print.c @@ -75,9 +75,10 @@ int * ptr = (int*)0x123; ptr int * null_ptr = (int*)0; null_ptr // CHECK-NEXT: (int *) 0x0 +union U { int I; float F; } u; u.I = 12; u.I +// CHECK-NEXT: (int) 12 + // TODO: _Bool, _Complex, _Atomic, and _BitInt -// union U { int I; float F; } u; u.I = 12; u.I -// TODO-CHECK-NEXT: (int) 12 // struct S1{} s1; s1 // TODO-CHECK-NEXT: (S1 &) @0x{{[0-9a-f]+}} @@ -86,4 +87,21 @@ int * 
null_ptr = (int*)0; null_ptr // E.d // TODO-CHECK-NEXT: (int) 22 +// ----------------------------------------------------------------------------- +// Tentative definition handling (C99 6.9.2) +// Verify that multiple distinct tentative definitions across inputs no longer +// conflict. Each variable should emit correctly in its own incremental module. +// ----------------------------------------------------------------------------- + +int t1; +int t2; +int t3; +t1 = 1; t2 = 2; t3 = 3; +t1 + t2 + t3 +// CHECK-NEXT: (int) 6 + +// A redefinition of an existing tentative variable should still fail. +int t1; +// expected-error {{duplicate definition of symbol '_t1'}} + %quit diff --git a/clang/test/Modules/crash-vfs-path-symlink-component.m b/clang/test/Modules/crash-vfs-path-symlink-component.m index 45a68659a8258..4cc4467166bd9 100644 --- a/clang/test/Modules/crash-vfs-path-symlink-component.m +++ b/clang/test/Modules/crash-vfs-path-symlink-component.m @@ -1,9 +1,5 @@ // Needs symlinks // UNSUPPORTED: system-windows -// env -u is not supported on AIX. -// TODO(boomanaiden154): Remove this once we have switched over to lit's -// internal shell which does support env -u. -// UNSUPPORTED: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} // REQUIRES: crash-recovery // FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it? diff --git a/clang/test/Modules/crash-vfs-path-traversal.m b/clang/test/Modules/crash-vfs-path-traversal.m index 8ab24755ed76f..680cda34700cf 100644 --- a/clang/test/Modules/crash-vfs-path-traversal.m +++ b/clang/test/Modules/crash-vfs-path-traversal.m @@ -2,10 +2,6 @@ // UNSUPPORTED: ms-sdk, target={{.*-(ps4|ps5)}} // Some assertions in this test use Linux style (/) file paths. // UNSUPPORTED: system-windows -// env -u is not supported on AIX. -// TODO(boomanaiden154): Remove this once we have switched over to lit's -// internal shell which does support env -u. 
-// UNSUPPORTED: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} // FIXME: Canonicalizing paths to remove relative traversal components // currenty fails a unittest on windows and is disable by default. diff --git a/clang/test/Modules/crash-vfs-relative-overlay.m b/clang/test/Modules/crash-vfs-relative-overlay.m index 048c65b909135..36849246b9db4 100644 --- a/clang/test/Modules/crash-vfs-relative-overlay.m +++ b/clang/test/Modules/crash-vfs-relative-overlay.m @@ -1,9 +1,5 @@ // UNSUPPORTED: system-windows // REQUIRES: crash-recovery -// env -u is not supported on AIX. -// TODO(boomanaiden154): Remove this once we have switched over to lit's -// internal shell which does support env -u. -// UNSUPPORTED: target={{.*}}-zos{{.*}}, target={{.*}}-aix{{.*}} // FIXME: This XFAIL is cargo-culted from crash-report.c. Do we need it? // XFAIL: target={{.*-windows-gnu}} diff --git a/clang/test/Modules/explicit-build-cwd.c b/clang/test/Modules/explicit-build-cwd.c new file mode 100644 index 0000000000000..af8b74361d685 --- /dev/null +++ b/clang/test/Modules/explicit-build-cwd.c @@ -0,0 +1,17 @@ +// This test checks that explicitly building the same module from different +// working directories results in the same PCM contents. 
+ +// RUN: rm -rf %t +// RUN: split-file %s %t +// RUN: mkdir %t/one +// RUN: mkdir %t/two + +//--- module.modulemap +module M { header "M.h" } + +//--- M.h + +// RUN: cd %t/one && %clang_cc1 -fmodules -emit-module %t/module.modulemap -fmodule-name=M -o %t/M_one.pcm +// RUN: cd %t/two && %clang_cc1 -fmodules -emit-module %t/module.modulemap -fmodule-name=M -o %t/M_two.pcm + +// RUN: diff %t/M_one.pcm %t/M_two.pcm diff --git a/clang/test/Modules/relative-resource-dir.m b/clang/test/Modules/relative-resource-dir.m index 96f2d8efc7860..1183184796d1e 100644 --- a/clang/test/Modules/relative-resource-dir.m +++ b/clang/test/Modules/relative-resource-dir.m @@ -8,5 +8,6 @@ // RUN: -fimplicit-module-maps -fmodules-cache-path=%t.mcp \ // RUN: -fbuiltin-headers-in-system-modules \ // RUN: -resource-dir resource-dir \ +// RUN: -internal-isystem resource-dir/include \ // RUN: -emit-module %S/Inputs/builtin-headers/module.modulemap \ // RUN: -fmodule-name=ModuleWithBuiltinHeader -o %t.pcm diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c index d7201207db572..4dd243e57a63e 100644 --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -215,6 +215,7 @@ // RUN: %clang -target aarch64-none-linux-gnu -march=armv9.4-a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s // RUN: %clang -target aarch64-none-linux-gnu -march=armv9.5-a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s // RUN: %clang -target aarch64-none-linux-gnu -march=armv9.6-a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s +// RUN: %clang -target aarch64-none-linux-gnu -march=armv9.7-a -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s // RUN: %clang -target aarch64-none-linux-gnu -march=armv9-a+sve2 -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SVE2 %s // CHECK-SVE2: __ARM_FEATURE_FP16_SCALAR_ARITHMETIC 1 // CHECK-SVE2: 
__ARM_FEATURE_FP16_VECTOR_ARITHMETIC 1 @@ -691,6 +692,7 @@ // RUN: %clang -target aarch64-none-elf -march=armv9.4-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s // RUN: %clang -target aarch64-none-elf -march=armv9.5-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s // RUN: %clang -target aarch64-none-elf -march=armv9.6-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s +// RUN: %clang -target aarch64-none-elf -march=armv9.7-a -x c -E -dM %s -o - | FileCheck --check-prefixes=CHECK-V81-OR-LATER,CHECK-V83-OR-LATER,CHECK-V85-OR-LATER %s // CHECK-V81-OR-LATER: __ARM_FEATURE_ATOMICS 1 // CHECK-V85-OR-LATER: __ARM_FEATURE_BTI 1 // CHECK-V83-OR-LATER: __ARM_FEATURE_COMPLEX 1 diff --git a/clang/test/Preprocessor/arm-target-features.c b/clang/test/Preprocessor/arm-target-features.c index fc37bebcb8330..689cd8184fd3b 100644 --- a/clang/test/Preprocessor/arm-target-features.c +++ b/clang/test/Preprocessor/arm-target-features.c @@ -934,6 +934,11 @@ // CHECK-V96A: #define __ARM_ARCH_9_6A__ 1 // CHECK-V96A: #define __ARM_ARCH_PROFILE 'A' +// RUN: %clang -target armv9.7a-none-none-eabi -x c -E -dM %s -o - | FileCheck -match-full-lines --check-prefix=CHECK-V97A %s +// CHECK-V97A: #define __ARM_ARCH 9 +// CHECK-V97A: #define __ARM_ARCH_9_7A__ 1 +// CHECK-V97A: #define __ARM_ARCH_PROFILE 'A' + // RUN: %clang -target arm-none-none-eabi -march=armv7-m -mfpu=softvfp -x c -E -dM %s -o - | FileCheck --check-prefix=CHECK-SOFTVFP %s // CHECK-SOFTVFP-NOT: #define __ARM_FP 0x diff --git a/clang/test/Preprocessor/cxx_oper_comma.cpp b/clang/test/Preprocessor/cxx_oper_comma.cpp new file mode 100644 index 0000000000000..5db803ae41f32 --- /dev/null +++ b/clang/test/Preprocessor/cxx_oper_comma.cpp @@ -0,0 +1,45 @@ +// RUN: %clang_cc1 -E -pedantic-errors %s -verify -std=c++98 +// RUN: %clang_cc1 -E -pedantic-errors %s 
-verify -std=c++11 +// RUN: %clang_cc1 -E -pedantic-errors %s -verify -std=c++14 +// RUN: %clang_cc1 -E -pedantic-errors %s -verify -std=c++17 +// RUN: %clang_cc1 -E -pedantic-errors %s -verify -std=c++20 +// RUN: %clang_cc1 -E -pedantic-errors %s -verify=expected,since-cxx23 -std=c++23 -Wno-c23-extensions +// RUN: %clang_cc1 -E -pedantic-errors %s -verify=expected,since-cxx23 -std=c++2c -Wno-c23-extensions + +// Test 1: Top-level comma +// expected-error@+1 {{expected end of line in preprocessor expression}} +#if 1, 2 +#endif + +// Test 2: Comma in conditional expression(CWG3017) +// Per CWG 3017, this exact case highlights the specification gap +// where C++ lacks explicit prohibition of comma operators in #if +// expected-error@+1 {{comma operator in operand of #if}} +#if 1 ? 1, 0 : 3 +#endif + +// Test 3: Parenthesized comma +// expected-error@+1 {{comma operator in operand of #if}} +#if (1, 2) +#endif + +// Test 4: Multiple commas +// expected-error@+1 {{expected end of line in preprocessor expression}} +#if 1, 2, 3 +#endif + +// Test 5: Comma in #elif +#if 0 +#elif (1, 2) // expected-error {{comma operator in operand of #if}} +#endif + +// Test 6: Leading comma (syntax error) +// expected-error@+1 {{invalid token at start of a preprocessor expression}} +#if , +#endif + +// Test 7: Comma in #embed limit parameter (C++23+) +#if __cplusplus >= 202302L +// since-cxx23-error@+1 {{expected ')'}} +#embed "jk.txt" limit(1, 2) +#endif diff --git a/clang/test/Preprocessor/hexagon-predefines.c b/clang/test/Preprocessor/hexagon-predefines.c index eebf48117d80c..bdedcf45ab118 100644 --- a/clang/test/Preprocessor/hexagon-predefines.c +++ b/clang/test/Preprocessor/hexagon-predefines.c @@ -171,6 +171,23 @@ // CHECK-V79HVX-128B: #define __HVX__ 1 // CHECK-V79HVX-128B: #define __hexagon__ 1 +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv81 %s\ +// RUN: | FileCheck %s -check-prefix CHECK-V81 +// CHECK-V81: #define __HEXAGON_ARCH__ 81 +// CHECK-V81: 
#define __HEXAGON_PHYSICAL_SLOTS__ 4 +// CHECK-V81: #define __HEXAGON_V81__ 1 +// CHECK-V81: #define __hexagon__ 1 + +// RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv81 \ +// RUN: -target-feature +hvxv81 -target-feature +hvx-length128b %s | FileCheck \ +// RUN: %s -check-prefix CHECK-V81HVX-128B +// CHECK-V81HVX-128B: #define __HEXAGON_ARCH__ 81 +// CHECK-V81HVX-128B: #define __HEXAGON_V81__ 1 +// CHECK-V81HVX-128B: #define __HVX_ARCH__ 81 +// CHECK-V81HVX-128B: #define __HVX_LENGTH__ 128 +// CHECK-V81HVX-128B: #define __HVX__ 1 +// CHECK-V81HVX-128B: #define __hexagon__ 1 + // RUN: %clang_cc1 -E -dM -triple hexagon-unknown-elf -target-cpu hexagonv67 \ // RUN: -target-feature +hvxv67 -target-feature +hvx-length128b %s | FileCheck \ // RUN: %s -check-prefix CHECK-ELF diff --git a/clang/test/Preprocessor/riscv-target-features-sifive.c b/clang/test/Preprocessor/riscv-target-features-sifive.c index 1c49b5549f3b7..5002f621b7c39 100644 --- a/clang/test/Preprocessor/riscv-target-features-sifive.c +++ b/clang/test/Preprocessor/riscv-target-features-sifive.c @@ -5,6 +5,11 @@ // CHECK-NOT: __riscv_xsfcease {{.*$}} // CHECK-NOT: __riscv_xsfvcp {{.*$}} +// CHECK-NOT: __riscv_xsfvfbfexp16e {{.*$}} +// CHECK-NOT: __riscv_xsfvfexp16e {{.*$}} +// CHECK-NOT: __riscv_xsfvfexp32e {{.*$}} +// CHECK-NOT: __riscv_xsfvfexpa {{.*$}} +// CHECK-NOT: __riscv_xsfvfexpa64e {{.*$}} // CHECK-NOT: __riscv_xsfvfnrclipxfqf {{.*$}} // CHECK-NOT: __riscv_xsfvfwmaccqqq {{.*$}} // CHECK-NOT: __riscv_xsfqmaccdod {{.*$}} @@ -38,6 +43,46 @@ // RUN: -o - | FileCheck --check-prefix=CHECK-XSFVCP-EXT %s // CHECK-XSFVCP-EXT: __riscv_xsfvcp 1000000{{$}} +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfbfexp16e_zvfbfmin -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFBFEXP16E-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfbfexp16e_zvfbfmin -E -dM %s \ +// RUN: -o - | FileCheck 
--check-prefix=CHECK-XSFVFBFEXP16E-EXT %s +// CHECK-XSFVFBFEXP16E-EXT: __riscv_xsfvfbfexp16e 5000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfexp16e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXP16E-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfexp16e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXP16E-EXT %s +// CHECK-XSFVFEXP16E-EXT: __riscv_xsfvfexp16e 5000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfexp32e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXP32E-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfexp32e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXP32E-EXT %s +// CHECK-XSFVFEXP32E-EXT: __riscv_xsfvfexp32e 5000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfexpa -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXPA-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfexpa -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXPA-EXT %s +// CHECK-XSFVFEXPA-EXT: __riscv_xsfvfexpa 2000{{$}} + +// RUN: %clang --target=riscv32-unknown-linux-gnu \ +// RUN: -march=rv32ixsfvfexpa64e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXPA64E-EXT %s +// RUN: %clang --target=riscv64-unknown-linux-gnu \ +// RUN: -march=rv64ixsfvfexpa64e -E -dM %s \ +// RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFEXPA64E-EXT %s +// CHECK-XSFVFEXPA64E-EXT: __riscv_xsfvfexpa64e 2000{{$}} + // RUN: %clang --target=riscv32-unknown-linux-gnu \ // RUN: -march=rv32ixsfvfnrclipxfqf -E -dM %s \ // RUN: -o - | FileCheck --check-prefix=CHECK-XSFVFNRCLIPXFQF-EXT %s diff --git a/clang/test/Sema/callingconv-devicekernel.cpp b/clang/test/Sema/callingconv-devicekernel.cpp new file mode 100644 index 0000000000000..f5da873fb68bd --- /dev/null +++ 
b/clang/test/Sema/callingconv-devicekernel.cpp @@ -0,0 +1,16 @@ +// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple nvptx64-nvidia-cuda- -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple spir64 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple spirv64 -fsyntax-only -verify %s + +[[clang::device_kernel]] void kernel1() {} + +namespace { +[[clang::device_kernel]] void kernel2() {} // expected-error {{'kernel2' is specified as a device kernel but it is not externally visible}} +} + +namespace ns { + [[clang::device_kernel]] void kernel3() {} +} + +[[clang::device_kernel]] static void kernel4() {} // expected-error {{'kernel4' is specified as a device kernel but it is not externally visible}} diff --git a/clang/test/Sema/callingconv.c b/clang/test/Sema/callingconv.c index f0b8b80a32974..28342b56af39a 100644 --- a/clang/test/Sema/callingconv.c +++ b/clang/test/Sema/callingconv.c @@ -55,6 +55,10 @@ int __attribute__((aarch64_vector_pcs)) aavpcs(void); // expected-warning {{'aar int __attribute__((aarch64_sve_pcs)) aasvepcs(void); // expected-warning {{'aarch64_sve_pcs' calling convention is not supported for this target}} int __attribute__((amdgpu_kernel)) amdgpu_kernel(void); // expected-warning {{'amdgpu_kernel' calling convention is not supported for this target}} +int __attribute__((device_kernel)) device_kernel(void) { // expected-warning {{'device_kernel' calling convention is not supported for this target}} +} +int __attribute__((sycl_kernel)) sycl_kernel(void) { // expected-warning {{'sycl_kernel' attribute ignored}} +} // PR6361 void ctest3(); diff --git a/clang/test/Sema/sifive-xsfmm.c b/clang/test/Sema/sifive-xsfmm.c new file mode 100644 index 0000000000000..7e055dd70c4b1 --- /dev/null +++ b/clang/test/Sema/sifive-xsfmm.c @@ -0,0 +1,30 @@ +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +xsfmmbase -target-feature +xsfmm32a -target-feature +xsfmm32a8f \ +// RUN: 
-target-feature +xsfmm32a16f -target-feature +xsfmm32a32f -target-feature +xsfmm64a64f \ +// RUN: -target-feature +xsfmm32a4f -target-feature +xsfmm32a8i -disable-O0-optnone \ +// RUN: -fsyntax-only %s -verify +// REQUIRES: riscv-registered-target +#include + +void test(vfloat32m8_t arg0, vuint8m8_t arg1) { + __riscv_sf_mm_f_f_w1(4, arg0, arg0, 1, 2, 3); + __riscv_sf_mm_e5m2_e4m3(8, arg1, arg1, 1, 2, 3); + __riscv_sf_mm_u_u(12, arg1, arg1, 1, 2, 3); + __riscv_sf_vtzero_t_e8w1(0, 0, 0); + + __riscv_sf_mm_f_f_w1(5, arg0, arg0, 1, 2, 3); /* expected-error {{argument should be a multiple of 4}} */ + __riscv_sf_mm_e5m2_e4m3(7, arg1, arg1, 1, 2, 3); /* expected-error {{argument should be a multiple of 4}} */ + __riscv_sf_mm_u_u(15, arg1, arg1, 1, 2, 3); /* expected-error {{argument should be a multiple of 4}} */ + __riscv_sf_mm_f_f_w1(16, arg0, arg0, 1, 2, 3); /* expected-error {{argument value 16 is outside the valid range [0, 15]}} */ + __riscv_sf_mm_e5m2_e4m3(20, arg1, arg1, 1, 2, 3); /* expected-error {{argument value 20 is outside the valid range [0, 15]}} */ + __riscv_sf_mm_u_u(24, arg1, arg1, 1, 2, 3); /* expected-error {{argument value 24 is outside the valid range [0, 15]}} */ + __riscv_sf_vtzero_t_e8w1(18, 0, 0); /* expected-error {{argument value 18 is outside the valid range [0, 15]}} */ + __riscv_sf_vtzero_t_e16w1(3, 0, 0); /* expected-error {{argument should be a multiple of 2}} */ + __riscv_sf_vtzero_t_e16w2(3, 0, 0); /* expected-error {{argument should be a multiple of 4}} */ + __riscv_sf_vtzero_t_e32w1(5, 0, 0); /* expected-error {{argument should be a multiple of 4}} */ + __riscv_sf_vtzero_t_e32w2(5, 0, 0); /* expected-error {{argument should be a multiple of 2}} */ + __riscv_sf_vtzero_t(5, 0, 0, 7, 1); /* expected-error {{argument value 7 is outside the valid range [3, 6]}} */ + __riscv_sf_vtzero_t(5, 0, 0, 2, 1); /* expected-error {{argument value 2 is outside the valid range [3, 6]}} */ + __riscv_sf_vtzero_t(5, 0, 0, 6, 3); /* expected-error {{RISC-V 
XSfmm twiden must be 1, 2 or 4}} */ + __riscv_sf_vtzero_t(5, 0, 0, 6, 5); /* expected-error {{RISC-V XSfmm twiden must be 1, 2 or 4}} */ +} diff --git a/clang/test/Sema/sifive_sf_vset_invalid.c b/clang/test/Sema/sifive_sf_vset_invalid.c new file mode 100644 index 0000000000000..96d8e0d2a6b1d --- /dev/null +++ b/clang/test/Sema/sifive_sf_vset_invalid.c @@ -0,0 +1,17 @@ +// REQUIRES: riscv-registered-target +// RUN: %clang_cc1 -triple riscv64 -target-feature +v \ +// RUN: -target-feature +xsfmmbase -disable-O0-optnone \ +// RUN: -o - -fsyntax-only %s -verify + +#include + +void test(size_t vl) { + __riscv_sf_vsettnt(vl, 1, 8); + // expected-error@-1 {{argument value 8 is outside the valid range [1, 3]}} + __riscv_sf_vsettm(vl, 8, 9); + // expected-error@-1 {{argument value 8 is outside the valid range [0, 3]}} + __riscv_sf_vsettn(vl, 8, 2); + // expected-error@-1 {{argument value 8 is outside the valid range [0, 3]}} + __riscv_sf_vsettk(vl, 0, 0); + // expected-error@-1 {{argument value 0 is outside the valid range [1, 3]}} +} diff --git a/clang/test/SemaCXX/alloc-token.cpp b/clang/test/SemaCXX/alloc-token.cpp new file mode 100644 index 0000000000000..be7acb7d42ef2 --- /dev/null +++ b/clang/test/SemaCXX/alloc-token.cpp @@ -0,0 +1,79 @@ +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s -fexperimental-new-constant-interpreter +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s -falloc-token-mode=typehash -DMODE_TYPEHASH +// RUN: %clang_cc1 -triple x86_64-linux-gnu -std=c++23 -fsyntax-only -verify %s -falloc-token-max=2 -DTOKEN_MAX=2 + +#if !__has_builtin(__builtin_infer_alloc_token) +#error "missing __builtin_infer_alloc_token" +#endif + +struct NoPtr { + int x; + long y; +}; + +struct WithPtr { + int a; + char *buf; +}; + +// Check specific known values; these are guaranteed to be stable. 
+#ifdef MODE_TYPEHASH +static_assert(__builtin_infer_alloc_token(sizeof(int)) == 2689373973731826898ULL); +static_assert(__builtin_infer_alloc_token(sizeof(char*)) == 2250492667400517147ULL); +static_assert(__builtin_infer_alloc_token(sizeof(NoPtr)) == 7465259095297095368ULL); +static_assert(__builtin_infer_alloc_token(sizeof(WithPtr)) == 11898882936532569145ULL); +#elif defined(TOKEN_MAX) +# if TOKEN_MAX == 2 +static_assert(__builtin_infer_alloc_token(sizeof(int)) == 0); +static_assert(__builtin_infer_alloc_token(sizeof(char*)) == 1); +static_assert(__builtin_infer_alloc_token(sizeof(NoPtr)) == 0); +static_assert(__builtin_infer_alloc_token(sizeof(WithPtr)) == 1); +# else +# error "unhandled TOKEN_MAX case" +# endif +#else +static_assert(__builtin_infer_alloc_token(sizeof(int)) == 2689373973731826898ULL); +static_assert(__builtin_infer_alloc_token(sizeof(char*)) == 11473864704255292954ULL); +static_assert(__builtin_infer_alloc_token(sizeof(NoPtr)) == 7465259095297095368ULL); +static_assert(__builtin_infer_alloc_token(sizeof(WithPtr)) == 11898882936532569145ULL); +#endif + +// Template function. +template +constexpr unsigned long get_token() { + return __builtin_infer_alloc_token(sizeof(T)); +} +static_assert(__builtin_infer_alloc_token(sizeof(int)) == get_token()); + +// Test complex expressions. +static_assert(__builtin_constant_p(__builtin_infer_alloc_token(sizeof(int)))); +static_assert(__builtin_infer_alloc_token(sizeof(NoPtr) * 2, 1) == get_token()); +static_assert(__builtin_infer_alloc_token(1, 4 + sizeof(NoPtr)) == get_token()); +static_assert(__builtin_infer_alloc_token(sizeof(NoPtr) << 8) == get_token()); + +// Test usable as a template param. 
+template +struct token_for_type { + static_assert(ID == get_token()); + static constexpr unsigned long value = ID; +}; +static_assert(token_for_type<__builtin_infer_alloc_token(sizeof(int)), int>::value == get_token()); + +template +void template_test() { + __builtin_infer_alloc_token(T()); // no error if not instantiated +} + +template +void negative_template_test() { + __builtin_infer_alloc_token(T()); // expected-error {{argument may not have 'void' type}} +} + +void negative_tests() { + __builtin_infer_alloc_token(); // expected-error {{too few arguments to function call}} + __builtin_infer_alloc_token((void)0); // expected-error {{argument may not have 'void' type}} + negative_template_test(); // expected-note {{in instantiation of function template specialization 'negative_template_test' requested here}} + constexpr auto inference_fail = __builtin_infer_alloc_token(123); // expected-error {{must be initialized by a constant expression}} \ + // expected-note {{could not infer allocation type for __builtin_infer_alloc_token}} +} diff --git a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp index 137f46ee3dc01..289059ea86eb9 100644 --- a/clang/test/SemaCXX/cxx2c-fold-exprs.cpp +++ b/clang/test/SemaCXX/cxx2c-fold-exprs.cpp @@ -157,66 +157,55 @@ static_assert(And1() == 1); // FIXME: The diagnostics are not so great static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = ]}} - // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} // expected-note@#and1 {{candidate template ignored: constraints 
not satisfied [with T = ]}} - // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And1() == 1); // expected-error {{no matching function for call to 'And1'}} // expected-note@#and1 {{candidate template ignored: constraints not satisfied [with T = ]}} - // expected-note@#and1 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And2() == 2); static_assert(And2() == 2); static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} - // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = S, U = ]}} \ - // expected-note@#and2 {{because 'typename U::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} - 
// expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And2() == 2); // expected-error {{no matching function for call to 'And2'}} // expected-note@#and2 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} - // expected-note@#and2 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And3() == 3); static_assert(And3() == 3); static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} - // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} - // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = S, U = ]}} - // expected-note@#and3 
{{because 'typename U::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(And3() == 3); // expected-error {{no matching function for call to 'And3'}} // expected-note@#and3 {{candidate template ignored: constraints not satisfied [with T = int, U = ]}} - // expected-note@#and3 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#and3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(Or1<>() == 1); // expected-error {{no matching function for call to 'Or1'}} @@ -227,8 +216,7 @@ static_assert(Or1() == 1); static_assert(Or1() == 1); static_assert(Or1() == 1); // expected-error {{no matching function for call to 'Or1'}} // expected-note@#or1 {{candidate template ignored: constraints not satisfied}} - // expected-note@#or1 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#or1 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(Or2() == 2); static_assert(Or2() == 2); @@ -236,16 +224,14 @@ static_assert(Or2() == 2); static_assert(Or2() == 2); static_assert(Or2() == 2); // expected-error {{no matching function for call to 'Or2'}} // expected-note@#or2 {{candidate template ignored: constraints not satisfied [with T = int, U = <>]}} - // expected-note@#or2 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#or2 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} static_assert(Or3() == 3); 
static_assert(Or3() == 3); static_assert(Or3() == 3); static_assert(Or3() == 3); static_assert(Or3() == 3); // expected-error {{no matching function for call to 'Or3'}} // expected-note@#or3 {{candidate template ignored: constraints not satisfied}} - // expected-note@#or3 {{because 'typename T::type' does not satisfy 'C'}} - // expected-note@#C {{because 'T' does not satisfy 'A'}} + // expected-note@#or3 {{because substituted constraint expression is ill-formed: type 'int' cannot be used prior to '::' because it has no members}} } namespace bool_conversion_break { diff --git a/clang/test/SemaCXX/ptrauth-nested-incomplete-types.cpp b/clang/test/SemaCXX/ptrauth-nested-incomplete-types.cpp new file mode 100644 index 0000000000000..8fad2fb05bc6d --- /dev/null +++ b/clang/test/SemaCXX/ptrauth-nested-incomplete-types.cpp @@ -0,0 +1,46 @@ +// RUN: %clang_cc1 -fptrauth-intrinsics -fsyntax-only -ferror-limit 1 -verify -std=c++26 %s +// RUN: %clang_cc1 -fptrauth-intrinsics -fsyntax-only -ferror-limit 1 -verify -std=c++03 %s +// RUN: %clang_cc1 -fsyntax-only -ferror-limit 1 -verify -std=c++03 %s + +/// Force two errors so we hit the error limit leading to skip of template instantiation +# "" // expected-error {{invalid preprocessing directive}} +# "" +// expected-error@* {{too many errors emitted}} + +template +struct a {}; +struct test_polymorphic { + virtual ~test_polymorphic(); + a field; +}; +static_assert(__is_trivially_relocatable(test_polymorphic)); + +struct test_struct { + test_struct(int) {} + void test_instantiate() { + test_struct d(0); + } + void test_type_trait_query() { + __is_trivially_relocatable(test_struct); + } + a e; +}; + +struct test_struct2 { + test_struct member; + void test() { + test_struct2 t{.member = {0}}; + } +}; + +struct test_subclass : test_struct { + test_subclass() : test_struct(0) { + } + + void test_subclass_instantiation() { + test_subclass subclass{}; + } + void test_subclass_type_trait_query() { + 
__is_trivially_relocatable(test_subclass); + } +}; diff --git a/clang/test/SemaHLSL/BuiltIns/matrix-constructors-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/matrix-constructors-errors.hlsl new file mode 100644 index 0000000000000..e60a8654a719a --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/matrix-constructors-errors.hlsl @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s + +typedef float float2x1 __attribute__((matrix_type(2,1))); +typedef float float2x2 __attribute__((matrix_type(2,2))); +typedef float float2 __attribute__((ext_vector_type(2))); + +struct S { float f; }; +struct S2 { float2 f;}; + +[numthreads(1,1,1)] +void entry() { + float2x1 LilMat = float2x1(1.0, 2.0); + float2x1 BrokenMat = float2x1(1.0, 2.0, 3.0); // expected-error{{too many initializers in list for type 'float2x1' (aka 'matrix') (expected 2 but found 3)}} + float2x2 NormieMat = float2x2(LilMat, 3.0, 4.0, 5.0); // expected-error{{too many initializers in list for type 'float2x2' (aka 'matrix') (expected 4 but found 5)}} + float2x2 BrokenNormie = float2x2(3.0, 4.0); // expected-error{{too few initializers in list for type 'float2x2' (aka 'matrix') (expected 4 but found 2)}} + float2x1 OverwhemledNormie = float2x1(3.0, 4.0, 5.0, 6.0); // expected-error{{too many initializers in list for type 'float2x1' (aka 'matrix') (expected 2 but found 4)}} + + // These should work in HLSL and not error + S s; + float2x1 GettingStrange = float2x1(s, s); + + S2 s2; + float2x2 GettingStrange2 = float2x2(s2, s2); + + // HLSL does not yet allow user-defined conversions. + struct T { + operator float() const { return 1.0f; } + } t; + // TODO: Should this work? Today HLSL doesn't resolve user-defined conversions here, but we maybe should... 
+ float2x1 foo5 = float2x1(t, t); // expected-error{{too few initializers in list for type 'float2x1' (aka 'matrix') (expected 2 but found 0)}} +} diff --git a/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl b/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl index 393d7300605c0..bcc94f0632d64 100644 --- a/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl +++ b/clang/test/SemaHLSL/Semantics/entry_parameter.hlsl @@ -11,4 +11,9 @@ void CSMain(int GI : SV_GroupIndex, uint ID : SV_DispatchThreadID, uint GID : SV // CHECK-NEXT: HLSLSV_GroupIDAttr // CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:96 GThreadID 'uint' // CHECK-NEXT: HLSLSV_GroupThreadIDAttr + +// CHECK: HLSLSV_GroupIndexAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'GI' 'int' 0 +// CHECK-NEXT: HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'ID' 'uint':'unsigned int' 0 +// CHECK-NEXT: HLSLSV_GroupIDAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'GID' 'uint':'unsigned int' 0 +// CHECK-NEXT: HLSLSV_GroupThreadIDAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'GThreadID' 'uint':'unsigned int' 0 } diff --git a/clang/test/SemaHLSL/Semantics/position.ps.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.hlsl index 32bc5f55b2abd..27a8e4a0e2662 100644 --- a/clang/test/SemaHLSL/Semantics/position.ps.hlsl +++ b/clang/test/SemaHLSL/Semantics/position.ps.hlsl @@ -1,7 +1,10 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -finclude-default-header -o - %s -ast-dump | FileCheck %s -float4 main(float4 a : SV_Position) { +// FIXME(Keenuts): add mandatory output semantic once those are implemented. 
+float4 main(float4 a : SV_Position2) { // CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:8 main 'float4 (float4)' // CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:20 a 'float4':'vector' // CHECK-NEXT: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> + +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> ParmVar 0x{{[0-9a-fA-F]+}} 'a' 'float4':'vector' 2 SemanticExplicitIndex } diff --git a/clang/test/SemaHLSL/Semantics/position.ps.struct.hlsl b/clang/test/SemaHLSL/Semantics/position.ps.struct.hlsl new file mode 100644 index 0000000000000..9f57231bea0c1 --- /dev/null +++ b/clang/test/SemaHLSL/Semantics/position.ps.struct.hlsl @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-pixel -x hlsl -finclude-default-header -o - %s -ast-dump | FileCheck %s + +struct S { + float4 f0 : SV_Position; +// CHECK: FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:10 f0 'float4':'vector' +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <<>> 0 + float4 f1 : SV_Position3; +// CHECK: FieldDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:10 f1 'float4':'vector' +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <<>> 3 SemanticExplicitIndex +}; + +// FIXME(Keenuts): add mandatory output semantic once those are implemented. 
+float4 main(S s) { +// CHECK: FunctionDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> line:[[@LINE-1]]:8 main 'float4 (S)' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9a-fA-F]+}} <{{.*}}> col:15 s 'S' + +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> Field 0x{{[0-9a-fA-F]+}} 'f0' 'float4':'vector' 0 +// CHECK: HLSLSV_PositionAttr 0x{{[0-9a-fA-F]+}} <{{.*}}> Field 0x{{[0-9a-fA-F]+}} 'f1' 'float4':'vector' 3 SemanticExplicitIndex +} diff --git a/clang/test/SemaHLSL/Semantics/struct_input.hlsl b/clang/test/SemaHLSL/Semantics/struct_input.hlsl new file mode 100644 index 0000000000000..66cab9552d4d9 --- /dev/null +++ b/clang/test/SemaHLSL/Semantics/struct_input.hlsl @@ -0,0 +1,19 @@ +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -finclude-default-header -o - %s -verify + +struct S { + float4 f0 : SV_Position; +// expected-error@+2 {{semantic annotations must be present for all parameters of an entry function or patch constant function}} +// expected-note@+1 {{'f1' used here}} + float4 f1; +}; + +[shader("pixel")] +// expected-note@+1 {{'s' declared here}} +void main(S s) { +} + +[shader("pixel")] +// expected-error@+2 {{semantic annotations must be present for all parameters of an entry function or patch constant function}} +// expected-note@+1 {{'f' declared here}} +void main2(float4 p : SV_POSITION, float4 f) +{ } diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl index 3cea47b66d6a6..da6a03bc93eeb 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-param.cl @@ -15,6 +15,8 @@ typedef half __attribute__((ext_vector_type(32))) half32; typedef float __attribute__((ext_vector_type(8))) float8; typedef float __attribute__((ext_vector_type(16))) float16; typedef float __attribute__((ext_vector_type(32))) float32; +typedef short __attribute__((ext_vector_type(2))) short2; +typedef unsigned short 
__attribute__((ext_vector_type(2))) ushort2; typedef int v4i __attribute__((ext_vector_type(4))); typedef int v8i __attribute__((ext_vector_type(8))); @@ -165,3 +167,19 @@ void test_cvt_f32_fp8_e5m3(global int* out, int a) { *out = __builtin_amdgcn_cvt_f32_fp8_e5m3(a, a); // expected-error {{'__builtin_amdgcn_cvt_f32_fp8_e5m3' must be a constant integer}} } + +void test_add_min_max(global int *out, int a, int b, int c, bool clamp) +{ + *out = __builtin_amdgcn_add_max_i32(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_add_max_i32' must be a constant integer}} + *out = __builtin_amdgcn_add_max_u32(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_add_max_u32' must be a constant integer}} + *out = __builtin_amdgcn_add_min_i32(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_add_min_i32' must be a constant integer}} + *out = __builtin_amdgcn_add_min_u32(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_add_min_u32' must be a constant integer}} +} + +void test_pk_add_min_max(global short2 *out, global ushort2 *uout, short2 a, short2 b, short2 c, ushort2 ua, ushort2 ub, ushort2 uc, bool clamp) +{ + *out = __builtin_amdgcn_pk_add_max_i16(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_pk_add_max_i16' must be a constant integer}} + *uout = __builtin_amdgcn_pk_add_max_u16(ua, ub, uc, clamp); // expected-error {{'__builtin_amdgcn_pk_add_max_u16' must be a constant integer}} + *out = __builtin_amdgcn_pk_add_min_i16(a, b, c, clamp); // expected-error {{'__builtin_amdgcn_pk_add_min_i16' must be a constant integer}} + *uout = __builtin_amdgcn_pk_add_min_u16(ua, ub, uc, clamp); // expected-error {{'__builtin_amdgcn_pk_add_min_u16' must be a constant integer}} +} diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250.cl index d7045cdf55837..e53beae474718 100644 --- a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250.cl +++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250.cl @@ 
-3,9 +3,21 @@ typedef unsigned int uint; typedef unsigned short int ushort; +typedef short __attribute__((ext_vector_type(2))) short2; +typedef unsigned short __attribute__((ext_vector_type(2))) ushort2; -void test(global uint* out, uint a, uint b, uint c) { +void test(global int* out, global short2 *s2out, global ushort2 *us2out, + uint a, uint b, uint c, short2 s2a, short2 s2b, short2 s2c, + ushort2 us2a, ushort2 us2b, ushort2 us2c) { __builtin_amdgcn_s_setprio_inc_wg(1); // expected-error {{'__builtin_amdgcn_s_setprio_inc_wg' needs target feature setprio-inc-wg-inst}} *out = __builtin_amdgcn_bitop3_b32(a, b, c, 1); // expected-error {{'__builtin_amdgcn_bitop3_b32' needs target feature bitop3-insts}} *out = __builtin_amdgcn_bitop3_b16((ushort)a, (ushort)b, (ushort)c, 1); // expected-error {{'__builtin_amdgcn_bitop3_b16' needs target feature bitop3-insts}} + *out = __builtin_amdgcn_add_max_i32(a, b, c, false); // expected-error {{'__builtin_amdgcn_add_max_i32' needs target feature add-min-max-insts}} + *out = __builtin_amdgcn_add_max_u32(a, b, c, true); // expected-error {{'__builtin_amdgcn_add_max_u32' needs target feature add-min-max-insts}} + *out = __builtin_amdgcn_add_min_i32(a, b, c, false); // expected-error {{'__builtin_amdgcn_add_min_i32' needs target feature add-min-max-insts}} + *out = __builtin_amdgcn_add_min_u32(a, b, c, true); // expected-error {{'__builtin_amdgcn_add_min_u32' needs target feature add-min-max-insts}} + *s2out = __builtin_amdgcn_pk_add_max_i16(s2a, s2b, s2c, false); // expected-error {{'__builtin_amdgcn_pk_add_max_i16' needs target feature pk-add-min-max-insts}} + *us2out = __builtin_amdgcn_pk_add_max_u16(us2a, us2b, us2c, true); // expected-error {{'__builtin_amdgcn_pk_add_max_u16' needs target feature pk-add-min-max-insts}} + *s2out = __builtin_amdgcn_pk_add_min_i16(s2a, s2b, s2c, false); // expected-error {{'__builtin_amdgcn_pk_add_min_i16' needs target feature pk-add-min-max-insts}} + *us2out = 
__builtin_amdgcn_pk_add_min_u16(us2a, us2b, us2c, true); // expected-error {{'__builtin_amdgcn_pk_add_min_u16' needs target feature pk-add-min-max-insts}} } diff --git a/clang/test/SemaOpenMP/openmp-begin-declare-variant_template.cpp b/clang/test/SemaOpenMP/openmp-begin-declare-variant_template.cpp new file mode 100644 index 0000000000000..ded8f58253540 --- /dev/null +++ b/clang/test/SemaOpenMP/openmp-begin-declare-variant_template.cpp @@ -0,0 +1,12 @@ +// RUN: %clang_cc1 -triple x86_64 -fopenmp -verify %s + +// FIXME: Is this supposed to work? + +#pragma omp begin declare variant match(implementation={extension(allow_templates)}) +template void f(T) {} +// expected-note@-1 {{explicit instantiation refers here}} +#pragma end +template struct A {}; +template A f() = delete; +template void f(float); +// expected-error@-1 {{explicit instantiation of undefined function template 'f'}} diff --git a/clang/test/SemaTemplate/concepts-recursive-inst.cpp b/clang/test/SemaTemplate/concepts-recursive-inst.cpp index 73dce9317f383..5e1bce5f15684 100644 --- a/clang/test/SemaTemplate/concepts-recursive-inst.cpp +++ b/clang/test/SemaTemplate/concepts-recursive-inst.cpp @@ -82,7 +82,6 @@ auto it = begin(rng); // #BEGIN_CALL // expected-error@#BEGIN_CALL {{no matching function for call to 'begin'}} // expected-note@#NOTINF_BEGIN {{candidate function}} // expected-note@#INF_BEGIN{{candidate template ignored: constraints not satisfied}} -// expected-note@#INF_BEGIN{{because 'Inf auto' does not satisfy 'Inf}} } } // namespace DirectRecursiveCheck diff --git a/clang/test/SemaTemplate/concepts.cpp b/clang/test/SemaTemplate/concepts.cpp index 5b0f3d39d9648..becf5467a1b61 100644 --- a/clang/test/SemaTemplate/concepts.cpp +++ b/clang/test/SemaTemplate/concepts.cpp @@ -1573,3 +1573,62 @@ namespace GH162770 { template auto comma = (..., Ts()); auto b = comma>; } // namespace GH162770 + +namespace GH164750 { + +template +struct a; +template +struct b; + +template